Section 0: Answers to questions

Question 1

Final Prediction Model (Decision Tree):

# Question 1
# Final Prediction Model (Decision Tree):

require(RCurl)
sit = getURLContent('https://github.com/systematicinvestor/SIT/raw/master/sit.gz', binary=TRUE, followlocation = TRUE, ssl.verifypeer = FALSE)
con = gzcon(rawConnection(sit, 'rb'))
source(con)
close(con)

## Get data, this is original file downloaded without any changes
setwd("~/Documents/interview_prep/Question1")
data_in <- read.csv(file = 'DataSet.csv')
date_ts <- as.Date(data_in$DATE, format = "%m/%d/%Y")
library(xts)
marketclose_xts <- xts(data_in$MARKETCLOSE,order.by = date_ts)
marketclose_ret <- 100*diff(log(marketclose_xts))

#Estimate historical relative volatility
library(quantmod)
ret.log <- ROC(marketclose_xts, type='continuous')
hist.vol <- runSD(ret.log, n = 21)
vol.rank <- percent.rank(SMA(percent.rank(hist.vol, 252), 21), 250)

## Mean reversion feature
rsi2_values <- RSI(marketclose_xts,2)

## Trend following feature
sma.short <-  SMA(marketclose_xts, 5)
sma.long <- SMA(marketclose_xts, 20)

sig_opt_optimal <- Lag(iif(vol.rank > 0.2, 
                           iif(rsi2_values < 36  , 1, -1),
                           iif(sma.short < sma.long | rsi2_values > 80, -1, 1)
))
ret_opt_optimal <- 1000*momentum(marketclose_xts)*sig_opt_optimal
# Verify sig_opt_optimal=1 gives ret_opt_optimal= market pnl
# it seems momentum function divides the difference by 10
eq_opt_optimal <- (cumsum(ret_opt_optimal[-c(1)]))

marketclose_weekday <- base::weekdays(date_ts,abbreviate=TRUE)
## Modeling Monday returns
marketclose_ret_Mon <- marketclose_ret[marketclose_weekday == "Mon"]
## Monday returns clearly show a visible pattern, we can create a time series for returns and use its predictability to improve the earlier forecast
## Switching strategy based on Monday returns:

## Mean reversion
rsi2_values_Mon = RSI(marketclose_ret_Mon/100,2)

## Trend following
sma.short_Mon <- SMA(marketclose_ret_Mon/100, 2)
sma.long_Mon <-  SMA(marketclose_ret_Mon/100, 5)
library(quantmod)
ret.log_Mon = marketclose_ret_Mon/100
hist.vol_Mon = runSD(ret.log_Mon, n = 5)
vol.rank_Mon = percent.rank(SMA(percent.rank(hist.vol_Mon, 52), 5), 50)

sig_opt.Mon <-  Lag(Lag(iif(vol.rank_Mon > 0.44, 
                            iif((rsi2_values_Mon < 62) , 1, -1),
                            iif(sma.short_Mon < sma.long_Mon | ((rsi2_values_Mon > 60)), -1, 1)
)))

sig_opt_optimal.MonCorrection <- sig_opt_optimal
sig_opt_optimal.MonCorrection[marketclose_weekday == "Mon"][-c(1,2)] <- sig_opt.Mon[-c(1,2)]

ret_opt_optimal.MonCorrection <- 1000*momentum(marketclose_xts)[-c(1,2)]*sig_opt_optimal.MonCorrection
eq_opt_optimal.MonCorrection <- as.numeric( cumsum( (ret_opt_optimal.MonCorrection) ))

lastday_row <- length(marketclose_ret)

prediction_opt <- iif(vol.rank[lastday_row] > 0.2, 
                      iif(rsi2_values[lastday_row] < 36  , 1, -1),
                      iif(sma.short[lastday_row] < sma.long[lastday_row] | rsi2_values > 80, -1, 1)
)

# Predicts -1
print(paste0("Prediction for next day:  ",prediction_opt))
[1] "Prediction for next day:  -1"

Question 2

Model cumulative PNL vs market PNL



plot(c(NA,eq_opt_optimal.MonCorrection)~date_ts[-c(1)],col="red",type='l', ylim=c(0,140000),xlab= "Year", ylab = "Cumulative PNL")
lines(coredata(eq_opt_optimal) ~ date_ts[-c(1)],col="blue")
lines(data_in$CUMSUM[-c(1)]~date_ts[-c(1)],col="black")
grid(col=1,lwd=1)
legend(x="topleft",legend=c("Model Cumulative PNL (Monday trend correction) Backtesting","Model Cumulative PNL: Backtesting","Market Cumulative PNL"),
       fill=c("red","blue","black"))

Model Development

Section I: Data Cleaning (Python)

# This snippet is written in python3
# import pandas as pd
# import numpy as np
# df = pd.read_csv('DataSet.csv')
# 
# for column in df.columns:
#   print(column)
#   print(df[df[column].isnull()].index.tolist())
# 
# df['FEATURE_3'] = df['FEATURE_3'].str.replace('%',"")
# df['DATE'] = pd.to_datetime(df['DATE'], format="%m/%d/%Y")
#
# #### We can use a dictionary to map the string pattern to nan
# #### cleaning_dict = {'^#.*': np.nan}
# #### df['FEATURE_3'] = df['FEATURE_3'].replace(cleaning_dict, regex=True)
# #### df['BINARY_FEATURE'] = df['BINARY_FEATURE'].replace(cleaning_dict, regex=True)
# # OR Using errors='coerce' will replace all non-numeric values with NaN
# df['BINARY_FEATURE'] = pd.to_numeric(df['BINARY_FEATURE'],errors='coerce')
# df['FEATURE_3'] = pd.to_numeric(df['FEATURE_3'],errors='coerce')
# #### Drop all nan
# df_rmna = df.dropna()
# #### Save file
# df_rmna.to_csv('DataSet_rmNA.csv')

Section II: Data Exploration (R)

setwd("~/Documents/interview_prep/Question1")
data_in <- read.csv(file = 'DataSet.csv')
date_ts <- as.Date(data_in$DATE, format = "%m/%d/%Y")
marketclose_ts <- ts(data_in$MARKETCLOSE, start=date_ts[1])
#print(marketclose_ts)
plot.ts(marketclose_ts)

library(xts)
marketclose_xts <- xts(x=data_in$MARKETCLOSE,order.by = date_ts)
autoplot(marketclose_xts)

# Split marketclose by week and calculate averages per week
weekly_marketclose_xts <- split(marketclose_xts, f = "weeks")
print(head(weekly_marketclose_xts[[1]]))
             [,1]
2002-07-01 109.86
2002-07-02 109.76
2002-07-03 109.83
print(head(weekly_marketclose_xts[[2]]))
             [,1]
2002-07-08 109.18
2002-07-09 109.27
2002-07-10 109.80
2002-07-11 110.06
2002-07-12 110.42
dates_weekly_endpoints <- xts::endpoints(marketclose_xts,on='weeks')
readings_per_week <- dates_weekly_endpoints[-c(1)]-dates_weekly_endpoints[-c(length(dates_weekly_endpoints))]

# print weeks with less than 5 values per week
for (weekID in 1:length(weekly_marketclose_xts)) {
  if (length(weekly_marketclose_xts[[weekID]][,1]) < 5) {
    week_len <- length(weekly_marketclose_xts[[weekID]])
    week_endpoint_ID <- dates_weekly_endpoints[1+weekID]
    print(paste("WeekID:",weekID,", num_readings:",week_len))
    print(paste("Week_start:",as.character(date_ts[week_endpoint_ID-week_len+1]),
    ", Week_end:",as.character(date_ts[week_endpoint_ID]) ) )
  }
}
[1] "WeekID: 1 , num_readings: 3"
[1] "Week_start: 2002-07-01 , Week_end: 2002-07-03"
[1] "WeekID: 10 , num_readings: 4"
[1] "Week_start: 2002-09-03 , Week_end: 2002-09-06"
[1] "WeekID: 22 , num_readings: 3"
[1] "Week_start: 2002-11-25 , Week_end: 2002-11-27"
[1] "WeekID: 26 , num_readings: 4"
[1] "Week_start: 2002-12-23 , Week_end: 2002-12-27"
[1] "WeekID: 27 , num_readings: 4"
[1] "Week_start: 2002-12-30 , Week_end: 2003-01-03"
[1] "WeekID: 30 , num_readings: 4"
[1] "Week_start: 2003-01-21 , Week_end: 2003-01-24"
[1] "WeekID: 34 , num_readings: 4"
[1] "Week_start: 2003-02-18 , Week_end: 2003-02-21"
[1] "WeekID: 42 , num_readings: 4"
[1] "Week_start: 2003-04-14 , Week_end: 2003-04-17"
[1] "WeekID: 48 , num_readings: 4"
[1] "Week_start: 2003-05-27 , Week_end: 2003-05-30"
[1] "WeekID: 53 , num_readings: 4"
[1] "Week_start: 2003-06-30 , Week_end: 2003-07-03"
[1] "WeekID: 62 , num_readings: 4"
[1] "Week_start: 2003-09-02 , Week_end: 2003-09-05"
[1] "WeekID: 74 , num_readings: 3"
[1] "Week_start: 2003-11-24 , Week_end: 2003-11-26"
[1] "WeekID: 78 , num_readings: 3"
[1] "Week_start: 2003-12-22 , Week_end: 2003-12-24"
[1] "WeekID: 79 , num_readings: 3"
[1] "Week_start: 2003-12-29 , Week_end: 2003-12-31"
[1] "WeekID: 82 , num_readings: 4"
[1] "Week_start: 2004-01-20 , Week_end: 2004-01-23"
[1] "WeekID: 86 , num_readings: 4"
[1] "Week_start: 2004-02-17 , Week_end: 2004-02-20"
[1] "WeekID: 101 , num_readings: 4"
[1] "Week_start: 2004-06-01 , Week_end: 2004-06-04"
[1] "WeekID: 102 , num_readings: 4"
[1] "Week_start: 2004-06-07 , Week_end: 2004-06-10"
[1] "WeekID: 106 , num_readings: 4"
[1] "Week_start: 2004-07-06 , Week_end: 2004-07-09"
[1] "WeekID: 115 , num_readings: 4"
[1] "Week_start: 2004-09-07 , Week_end: 2004-09-10"
[1] "WeekID: 126 , num_readings: 3"
[1] "Week_start: 2004-11-22 , Week_end: 2004-11-24"
[1] "WeekID: 130 , num_readings: 4"
[1] "Week_start: 2004-12-20 , Week_end: 2004-12-23"
[1] "WeekID: 131 , num_readings: 4"
[1] "Week_start: 2004-12-27 , Week_end: 2004-12-30"
[1] "WeekID: 134 , num_readings: 4"
[1] "Week_start: 2005-01-18 , Week_end: 2005-01-21"
[1] "WeekID: 139 , num_readings: 4"
[1] "Week_start: 2005-02-22 , Week_end: 2005-02-25"
[1] "WeekID: 143 , num_readings: 4"
[1] "Week_start: 2005-03-21 , Week_end: 2005-03-24"
[1] "WeekID: 153 , num_readings: 4"
[1] "Week_start: 2005-05-31 , Week_end: 2005-06-03"
[1] "WeekID: 158 , num_readings: 4"
[1] "Week_start: 2005-07-05 , Week_end: 2005-07-08"
[1] "WeekID: 167 , num_readings: 4"
[1] "Week_start: 2005-09-06 , Week_end: 2005-09-09"
[1] "WeekID: 178 , num_readings: 3"
[1] "Week_start: 2005-11-21 , Week_end: 2005-11-23"
[1] "WeekID: 183 , num_readings: 4"
[1] "Week_start: 2005-12-27 , Week_end: 2005-12-30"
[1] "WeekID: 184 , num_readings: 4"
[1] "Week_start: 2006-01-03 , Week_end: 2006-01-06"
[1] "WeekID: 186 , num_readings: 4"
[1] "Week_start: 2006-01-17 , Week_end: 2006-01-20"
[1] "WeekID: 191 , num_readings: 4"
[1] "Week_start: 2006-02-21 , Week_end: 2006-02-24"
[1] "WeekID: 198 , num_readings: 4"
[1] "Week_start: 2006-04-10 , Week_end: 2006-04-13"
[1] "WeekID: 205 , num_readings: 4"
[1] "Week_start: 2006-05-30 , Week_end: 2006-06-02"
[1] "WeekID: 210 , num_readings: 3"
[1] "Week_start: 2006-07-05 , Week_end: 2006-07-07"
[1] "WeekID: 235 , num_readings: 4"
[1] "Week_start: 2006-12-26 , Week_end: 2006-12-29"
[1] "WeekID: 236 , num_readings: 4"
[1] "Week_start: 2007-01-02 , Week_end: 2007-01-05"
weekly_avg_marketclose <- lapply(X = weekly_marketclose_xts, function(X){return(mean(X))})
all_weeks <- ts(unlist(weekly_avg_marketclose))
filtered_weeks <- all_weeks
filtered_weeks[readings_per_week<5] <- NaN
#filtered_weeks <- ts(unlist(weekly_avg_marketclose)[readings_per_week==5])
closes <- cbind(all_weeks, filtered_weeks)
autoplot(closes)

#autoplot(ts(unlist(weekly_avg_marketclose)))

It would be interesting to see effects of holidays/long weekends, annual events (eg superbowl, quad witching). We will ignore them for now. Other factors include sector specific sentiment or whole market sentiment, which hopefully decomposition below will uncover.

Number of daily readings by year:


table(format(date_ts,"%Y"))

2002 2003 2004 2005 2006 2007 
 126  250  250  251  252   44 

Number of weekly readings by year:

table(format(date_ts[dates_weekly_endpoints[-c(1)]],"%Y"))

2002 2003 2004 2005 2006 2007 
  26   53   52   52   52    9 

Number of trading days per trading week (avg) by year

table(format(date_ts,"%Y"))/table(format(date_ts[dates_weekly_endpoints[-c(1)]],"%Y"))

    2002     2003     2004     2005     2006     2007 
4.846154 4.716981 4.807692 4.826923 4.846154 4.888889 

Number of trading days per month by year

dates_monthly_endpoints <- xts::endpoints(marketclose_xts,on='months')
table(format(date_ts[dates_monthly_endpoints[-c(1)]],"%Y"))

2002 2003 2004 2005 2006 2007 
   6   12   12   12   12    3 
table(format(date_ts,"%Y"))/table(format(date_ts[dates_monthly_endpoints[-c(1)]],"%Y"))

    2002     2003     2004     2005     2006     2007 
21.00000 20.83333 20.83333 20.91667 21.00000 14.66667 
table(format(date_ts,"%m/%Y",start=("07/2002")))

01/2003 01/2004 01/2005 01/2006 01/2007 02/2003 02/2004 02/2005 02/2006 02/2007 03/2003 
     21      19      20      20      22      19      19      19      19      20      21 
03/2004 03/2005 03/2006 03/2007 04/2003 04/2004 04/2005 04/2006 05/2003 05/2004 05/2005 
     23      22      23       2      21      22      21      19      21      20      21 
05/2006 06/2003 06/2004 06/2005 06/2006 07/2002 07/2003 07/2004 07/2005 07/2006 08/2002 
     22      21      21      22      22      21      22      21      20      19      22 
08/2003 08/2004 08/2005 08/2006 09/2002 09/2003 09/2004 09/2005 09/2006 10/2002 10/2003 
     21      22      23      23      20      21      21      21      21      23      23 
10/2004 10/2005 10/2006 11/2002 11/2003 11/2004 11/2005 11/2006 12/2002 12/2003 12/2004 
     21      21      22      19      18      20      20      22      21      21      21 
12/2005 12/2006 
     21      20 
mean(table(format(date_ts,"%m/%Y")))
[1] 20.57895
table(format(date_ts,"%Y"))

2002 2003 2004 2005 2006 2007 
 126  250  250  251  252   44 

Last reading date for each year:

date_ts[xts::endpoints(marketclose_xts,on='years')[-c(1)]]
[1] "2002-12-31" "2003-12-31" "2004-12-30" "2005-12-30" "2006-12-29" "2007-03-02"

Year 2003,04,05,06 seasonal trends:

weekly_avg_marketclose_2003456  <- ts(as.numeric(unlist(weekly_avg_marketclose))[(26+1):(26+53+52+52+52)],frequency=52)
#library(forecast)
no_transform_2003456 <- stl(log(weekly_avg_marketclose_2003456), s.window=53)
#no_transform_2003456 <- mstl(weekly_avg_marketclose_2003456)
autoplot(no_transform_2003456)

#TimeSeriesWeeklyDecomposed<-stl(ts_data , s.window="periodic")

Seasonal trends:

library(forecast)
This is forecast 8.16 
  Need help getting started? Try the online textbook FPP:
  http://otexts.com/fpp2/
no_transform_2003456 %>% seasonal() %>% ggsubseriesplot()

no_transform_2003456 %>% remainder() %>% autoplot()

rm(list=ls())
setwd("~/Documents/interview_prep/Question1")
data_in <- read.csv(file = 'DataSet.csv')
date_ts <- as.Date(data_in$DATE, format = "%m/%d/%Y")
marketclose_ts <- ts(data_in$MARKETCLOSE, start=date_ts[1],frequency = 251.625)
plot.ts(marketclose_ts)

Given the irregular nature of the time series we use xts modeling and estimate daily returns %


library(xts)
marketclose_xts <- xts(x=data_in$MARKETCLOSE,order.by = date_ts)
marketclose_ret <- 100*diff.xts(log(marketclose_xts))
#marketclose_ret <- as.zoo(marketclose_ret[-c(1)])
plot.zoo(marketclose_ret, ylab = "Daily returns (%)", main = "Percentage daily returns")
# lowess fit with the f = 1/average # trading days in a month
# Calculation for 20.8 follow below
lines(index(marketclose_ret), lowess(marketclose_ret[,1], f = 1/20.8)$y, col = "red", lwd = 2)

# Volatility as function of month
plot(jitter(as.numeric(format(index(marketclose_ret),"%m")), amount = 1/3), marketclose_ret, pch = 20, ylab = "Daily percent returns", xlab = "Month", bty = "l")

boxplot(as.vector(marketclose_ret) ~ factor(quarters(index(marketclose_ret),abbreviate = TRUE),labels=c("Q3","Q4","Q1","Q2")), xlab = "Quarter", pch = 20, col = rgb(0, 0, 0, 0.4), ylab = "Daily percent returns", bty = "l")
grid(col=1,lwd=1.5)

boxplot(as.vector(marketclose_ret) ~ factor(months(index(marketclose_ret),abbreviate = TRUE),     levels = c("Jul","Aug","Sep","Oct","Nov","Dec","Jan","Feb","Mar","Apr","May","Jun")), xlab = "Month", pch = 20, col = rgb(0, 0, 0, 0.4), ylab = "Daily percent returns", bty = "l")
grid(col=1,lwd=1.5)

boxplot(as.vector(marketclose_ret) ~ factor(weekdays(index(marketclose_ret),abbreviate = TRUE),
        levels = c("Mon","Tue","Wed","Thu","Fri")) ,xlab = "Day of the week", pch = 20, col = rgb(0, 0, 0, 0.4), ylab = "Daily percent returns", bty = "l")
grid(col=1,lwd=1.5)

myplot <- boxplot(as.vector(marketclose_ret) ~ factor(weekdays(index(marketclose_ret),abbreviate = TRUE),levels = c("Mon","Tue","Wed","Thu","Fri"))*factor(months(index(marketclose_ret),abbreviate = TRUE),levels = c("Jul","Aug","Sep","Oct","Nov","Dec","Jan","Feb","Mar","Apr","May","Jun")),
xlab = "Day of the week in given month", pch = 20, col = rgb(0, 0, 0, 0.4), ylab = "Daily percent returns", bty = "l")
grid(col=1,lwd=1.5)

myplot <- boxplot(as.vector(marketclose_ret) ~ factor(weekdays(index(marketclose_ret),abbreviate = TRUE),levels = c("Mon","Tue","Wed","Thu","Fri"))*factor(quarters(index(marketclose_ret),abbreviate = TRUE),levels = c("Q3","Q4","Q1","Q2")),
xlab = "Day of the week in given year", pch = 20, col = rgb(0, 0, 0, 0.4), ylab = "Daily percent returns", bty = "l")
grid(col=1,lwd=1.5)

myplot <- boxplot(as.vector(marketclose_ret) ~ factor(weekdays(index(marketclose_ret),abbreviate = TRUE),levels = c("Mon","Tue","Wed","Thu","Fri"))*factor(format(index(marketclose_ret),"%Y"),levels = c("2002","2003","2004","2005","2006","2007")),
xlab = "Day of the week in given year", pch = 20, col = rgb(0, 0, 0, 0.4), ylab = "Daily percent returns", bty = "l")
grid(col=1,lwd=1.5)

Monday returns as function of return on friday return in prev week by year

#head(which(factor(weekdays(index(marketclose_ret),abbreviate = TRUE),levels = c("Mon","Tue","Wed","Thu","Fri"))=="Mon"))
#head(which(factor(weekdays(index(marketclose_ret),abbreviate = TRUE),levels = c("Mon","Tue","Wed","Thu","Fri"))=="Fri"))


monday_indices <- which(weekdays(index(marketclose_ret),abbreviate = TRUE)=="Mon")
monday_indices <- monday_indices[-c(1)]
plot(as.vector(marketclose_ret[monday_indices])~as.vector(marketclose_ret[(monday_indices-1)]),
     xlab="Prev day change",ylab="Monday change")
lines(lowess(as.vector(marketclose_ret[monday_indices])~as.vector(marketclose_ret[(monday_indices-1)]),f=1/2),col="red")
grid(col=1,lwd=1.5)


plot.ts(as.vector(marketclose_ret[(monday_indices)]),ylim=c(-1.5,1.5),
     xlab="Date",ylab = "% Change")
#lines(as.vector(100*diff.xts(log(marketclose_xts)[(monday_indices-1)])),col=2)
lines(lowess(as.vector(marketclose_ret[(monday_indices-1)]),f=1/2),col="green",type="o")
lines(lowess(as.vector(marketclose_ret[(monday_indices)]),f=1/2),col="red",ylim=c(-1,1),type="o")
lines(lowess(as.vector(marketclose_ret[(monday_indices-2)]),f=1/2),col="blue",ylim=c(-1,1),type="o")
#lines(lowess(as.vector(marketclose_ret[(monday_indices-3)]),f=1/2),col="pink",ylim=c(-1,1),type="o")
lines(lowess(as.vector(100*diff.xts(log(marketclose_xts)[(monday_indices)])),f=1/2),col="black",ylim=c(-1,1),type="o")
lines(lowess(as.vector(100*diff.xts(log(marketclose_xts)[(monday_indices-1)])),f=1/2),col="orange",ylim=c(-1,1),type="o")
#lines(lowess(as.vector(100*diff.xts(log(marketclose_xts)[(monday_indices-2)])),f=1/2),col="yellow",ylim=c(-1,1),type="o")
#lines(sign(as.numeric(marketclose_ret)[monday_indices]))
grid(col=1,lwd=1.5)

plot.ts(as.vector(marketclose_ret[(monday_indices)]),ylim=c(-1.5,1.5),
     xlab="Date",ylab = "% Change")
#lines(as.vector(100*diff.xts(log(marketclose_xts)[(monday_indices-1)])),col=2)
lines(lowess(as.vector(marketclose_ret[(monday_indices-1)]),f=1/4),col="green",type="o")
lines(lowess(as.vector(marketclose_ret[(monday_indices)]),f=1/4),col="red",ylim=c(-1,1),type="o")
lines(lowess(as.vector(marketclose_ret[(monday_indices-2)]),f=1/4),col="blue",ylim=c(-1,1),type="o")
#lines(lowess(as.vector(marketclose_ret[(monday_indices-3)]),f=1/2),col="pink",ylim=c(-1,1),type="o")
lines(lowess(as.vector(100*diff.xts(log(marketclose_xts)[(monday_indices)])),f=1/4),col="black",ylim=c(-1,1),type="o")
lines(lowess(as.vector(100*diff.xts(log(marketclose_xts)[(monday_indices-1)])),f=1/4),col="orange",ylim=c(-1,1),type="o")
#lines(lowess(as.vector(100*diff.xts(log(marketclose_xts)[(monday_indices-2)])),f=1/2),col="yellow",ylim=c(-1,1),type="o")
#lines(sign(as.numeric(marketclose_ret)[monday_indices]))
grid(col=1,lwd=1.5)

plot.ts(as.vector(marketclose_ret[(monday_indices)]),ylim=c(-1.5,1.5),
     xlab="Date",ylab = "% Change")
#lines(as.vector(100*diff.xts(log(marketclose_xts)[(monday_indices-1)])),col=2)
lines(lowess(as.vector(marketclose_ret[(monday_indices-1)]),f=1/8),col="green",type="o")
lines(lowess(as.vector(marketclose_ret[(monday_indices)]),f=1/8),col="red",ylim=c(-1,1),type="o")
lines(lowess(as.vector(marketclose_ret[(monday_indices-2)]),f=1/8),col="blue",ylim=c(-1,1),type="o")
#lines(lowess(as.vector(marketclose_ret[(monday_indices-3)]),f=1/2),col="pink",ylim=c(-1,1),type="o")
lines(lowess(as.vector(100*diff.xts(log(marketclose_xts)[(monday_indices)])),f=1/8),col="black",ylim=c(-1,1),type="o")
lines(lowess(as.vector(100*diff.xts(log(marketclose_xts)[(monday_indices-1)])),f=1/8),col="orange",ylim=c(-1,1),type="o")
#lines(lowess(as.vector(100*diff.xts(log(marketclose_xts)[(monday_indices-2)])),f=1/2),col="yellow",ylim=c(-1,1),type="o")
#lines(sign(as.numeric(marketclose_ret)[monday_indices]))
grid(col=1,lwd=1.5)

Even with f=1/8 it seems Monday daily returns will follow strictly downward trend

#tail(lowess(as.vector(marketclose_ret[c((monday_indices-1),1173)]),f=1/8)$y,15)
#tail(lowess(as.vector(marketclose_ret[c((monday_indices-2),1172)]),f=1/8)$y,15)
#tail(lowess(as.vector(marketclose_ret[c((monday_indices-3),1171)]),f=1/8)$y,15)
#tail(lowess(as.vector(marketclose_ret[c((monday_indices-4),1170)]),f=1/8)$y,15)
#tail(lowess(as.vector(marketclose_ret[c((monday_indices))]),f=1/8)$y,15)

plot(tail(lowess(as.vector(marketclose_ret[c((monday_indices-1),1173)]),f=1/8)$y,50),ylim=c(-1,1),type='l',main="trend 50 day returns by day of the week",ylab="% Change",xlab="day")
lines(tail(lowess(as.vector(marketclose_ret[c((monday_indices-2),1172)]),f=1/8)$y,50),col="blue")
lines(tail(lowess(as.vector(marketclose_ret[c((monday_indices-3),1171)]),f=1/8)$y,50),col="green")
lines(tail(lowess(as.vector(marketclose_ret[c((monday_indices-4),1170)]),f=1/8)$y,50),col="orange")
lines(tail(lowess(as.vector(marketclose_ret[c((monday_indices))]),f=1/8)$y,50),col="red")
grid(col=1,lwd=1.5)

plot(c(rep(0,50),rollmean(as.vector(marketclose_ret),50)),ylim=c(-1,1),type='l',main="50 & 200 day MA of daily returns",ylab="% Change",xlab="day",col="red")
lines(c(rep(0,200),rollmean(as.vector(marketclose_ret),200)),col="green")
#lines(as.vector(marketclose_ret))
lines(lowess(marketclose_ret[,1], f = 1/20.8)$y, lwd = 2)
#lines(rollmean(as.vector(marketclose_ret[c((monday_indices-2),1172)]),50),col="blue")
#lines(rollmean(as.vector(marketclose_ret[c((monday_indices-3),1171)]),50),col="green")
#lines(rollmean(as.vector(marketclose_ret[c((monday_indices-4),1170)]),50),col="orange")
#lines(rollmean(as.vector(marketclose_ret[monday_indices]),50),col="red")
grid(col=1,lwd=1.5)

plot(rollmean(as.vector(marketclose_ret[c((monday_indices-1),1173)]),50),ylim=c(-.25,.25),type='l',main="50 week MA of weekly returns by day of the week",ylab="% Change",xlab="day")
lines(rollmean(as.vector(marketclose_ret[c((monday_indices-2),1172)]),50),col="blue")
lines(rollmean(as.vector(marketclose_ret[c((monday_indices-3),1171)]),50),col="green")
lines(rollmean(as.vector(marketclose_ret[c((monday_indices-4),1170)]),50),col="orange")
lines(rollmean(as.vector(marketclose_ret[monday_indices]),50),col="red")
grid(col=1,lwd=1.5)

plot(rollmean(as.vector(marketclose_ret[c((monday_indices-1),1173)]),200),ylim=c(-.25,.25),type='l',main="200 week MA returns by day of the week",ylab="% Change",xlab="day")
lines(rollmean(as.vector(marketclose_ret[c((monday_indices-2),1172)]),200),col="blue")
lines(rollmean(as.vector(marketclose_ret[c((monday_indices-3),1171)]),200),col="green")
lines(rollmean(as.vector(marketclose_ret[c((monday_indices-4),1170)]),200),col="orange")
lines(rollmean(as.vector(marketclose_ret[monday_indices]),200),col="red")
grid(col=1,lwd=1.5)

monday return wrt prev monday

indices_2007 <- which(as.numeric(format(index(marketclose_ret),"%Y")) %in% c(2006,2007))
monday_indices_2007 <- monday_indices[monday_indices %in% indices_2007]
plot(as.vector(marketclose_ret[monday_indices_2007])~as.vector(marketclose_ret[(monday_indices_2007-1)]),
     xlab="Prev day change 2006-2007",ylab="Monday change 2006-2007")
lines(lowess(as.vector(marketclose_ret[monday_indices_2007])~as.vector(marketclose_ret[(monday_indices_2007-1)]),f=1/2),col="red")
grid(col=1,lwd=1.5)

#plot.zoo(xts(sign(as.numeric(marketclose_ret)[monday_indices_2007])*sign(as.numeric(marketclose_ret)[(monday_indices_2007-1)]),order.by = index(marketclose_ret)[monday_indices_2007]),main= "Sign of Monday change times sign of prev day")
#lines(xts(sign(as.numeric(marketclose_ret)[monday_indices_2007]),order.by = index(marketclose_ret)[monday_indices_2007]),col="red")
#lines(sign(as.numeric(marketclose_ret)[monday_indices_2007]))
plot(lowess(as.vector(marketclose_ret[(monday_indices_2007-1)]),f=1/2),col="green",ylim=c(-1,1),
     xlab="Date",ylab = "% Change")
lines(lowess(as.vector(marketclose_ret[(monday_indices_2007)]),f=1/2),col="red",ylim=c(-1,1),type="o")
lines(lowess(as.vector(marketclose_ret[(monday_indices_2007-2)]),f=1/2),col="blue",ylim=c(-1,1),type="o")
lines(lowess(as.vector(marketclose_ret[(monday_indices_2007-3)]),f=1/2),col="pink",ylim=c(-1,1),type="o")
lines(sign(as.numeric(marketclose_ret)[monday_indices_2007]))
grid(col=1,lwd=1.5)

Plot of features

setwd("~/Documents/interview_prep/Question1")
data_in_cleaned <- read.csv(file = 'DataSet_rmNA.csv')
date_ts_cleaned <- as.Date(data_in_cleaned$DATE, format = "%Y-%m-%d")
marketclose_ts_cleaned <- ts(data_in_cleaned$MARKETCLOSE, start=date_ts_cleaned[1],frequency = 251.625)
plot.ts(marketclose_ts_cleaned)

library(xts)
marketclose_xts_cleaned <- xts(x=data_in_cleaned$MARKETCLOSE,order.by = date_ts_cleaned)
marketclose_ret_cleaned <- 100*diff.xts(log(marketclose_xts_cleaned))
## marketclose_ret_cleaned <- marketclose_ret_cleaned[-c(1)]
#marketclose_ret <- as.zoo(marketclose_ret[-c(1)])
plot.zoo(marketclose_ret_cleaned, ylab = "Daily returns (%)", main = "Percentage daily returns")
# lowess fit with the f = 1/average # trading days in a month
# Calculation for 20.8 follow below
lines(index(marketclose_ret_cleaned), lowess(marketclose_ret_cleaned[,1], f = 1/20.8)$y, col = "red", lwd = 2)
feature1_xts <- xts(x=data_in_cleaned$FEATURE_1,order.by = date_ts_cleaned)
feature1_diff_xts <- diff.xts((feature1_xts))
## feature1_diff_xts <- feature1_diff_xts[-c(1)]
lines(index(feature1_diff_xts), lowess(feature1_diff_xts[,1], f = 1/20.8)$y, col = "blue", lwd = 2)
feature2_xts <- xts(x=data_in_cleaned$FEATURE_2,order.by = date_ts_cleaned)
feature2_diff_xts <- diff.xts((feature2_xts))
lines(index(feature2_diff_xts), lowess(feature2_diff_xts[,1], f = 1/20.8)$y, col = "green", lwd = 2)
## feature2_diff_xts <- feature2_diff_xts[-c(1)]

feature3_xts <- xts(x=data_in_cleaned$FEATURE_3,order.by = date_ts_cleaned)
feature3_scaled_xts <- ((feature3_xts))/100
lines(index(feature3_scaled_xts), lowess(feature3_scaled_xts[,1], f = 1/20.8)$y, col = "orange", lwd = 2)
## feature3_scaled_xts <-  feature3_scaled_xts[-c(1)]

binaryf_xts <- xts(x=data_in_cleaned$BINARY_FEATURE,order.by = date_ts_cleaned)
## binaryf_xts <- binaryf_xts[-c(1)]
lines(index(binaryf_xts),binaryf_xts, col = "yellow", lwd = 2)

Cross corr between time seriers

title_logret <- "Daily log returns (%)"
par(mfrow = c(2, 2))
TSA::acf(marketclose_ret_cleaned, na.action = na.pass, lag.max = 100, main = title_logret)
pacf(marketclose_ret_cleaned, na.action = na.pass,lag.max = 100, main = "")


# (P)ACF of absolute value of daily returns
#par(mfrow = c(1, 2))
title_abslogret <- "Daily abs log returns (%)"
TSA::acf(abs(marketclose_ret_cleaned), na.action = na.pass, lag.max = 100, main = title_abslogret)
pacf(abs(marketclose_ret_cleaned), na.action = na.pass, lag.max = 100, main = "")

# (P)ACF of squared daily returns
#par(mfrow = c(1, 2))
#TSA::acf(I(marketclose_ret_cleaned^2), na.action = na.pass, main = title_sp)
#pacf(I(marketclose_ret_cleaned^2), na.action = na.pass, main = title_sp)
title_logret <- "Feature1 log returns"
par(mfrow = c(2, 2))
TSA::acf(feature1_diff_xts, na.action = na.pass, lag.max = 100, main = title_logret)
pacf(feature1_diff_xts, na.action = na.pass,lag.max = 100, main = "")


# (P)ACF of absolute value of daily returns
#par(mfrow = c(1, 2))
title_abslogret <- "Feature1 abs log returns"
TSA::acf(abs(feature1_diff_xts), na.action = na.pass, lag.max = 100, main = title_abslogret)
pacf(abs(feature1_diff_xts), na.action = na.pass, lag.max = 100, main = "")

title_logret <- "Feature2 log returns"
par(mfrow = c(2, 2))
TSA::acf(feature2_diff_xts, na.action = na.pass, lag.max = 100, main = title_logret)
pacf(feature2_diff_xts, na.action = na.pass,lag.max = 100, main = "")


# (P)ACF of absolute value of daily returns
#par(mfrow = c(1, 2))
title_abslogret <- "Feature2 abs log returns"
TSA::acf(abs(feature2_diff_xts), na.action = na.pass, lag.max = 100, main = title_abslogret)
pacf(abs(feature2_diff_xts), na.action = na.pass, lag.max = 100, main = "")

title_logret <- "Feature3 log returns"
par(mfrow = c(2, 2))
TSA::acf(feature3_scaled_xts, na.action = na.pass, lag.max = 100, main = title_logret)
pacf(feature3_scaled_xts, na.action = na.pass,lag.max = 100, main = "")


# (P)ACF of absolute value of daily returns
#par(mfrow = c(1, 2))
title_abslogret <- "Feature3 abs log returns"
TSA::acf(abs(feature3_scaled_xts), na.action = na.pass, lag.max = 100, main = title_abslogret)
pacf(abs(feature3_scaled_xts), na.action = na.pass, lag.max = 100, main = "")

Plot ret vs f1 f2 f3

par(mfrow =c(1,3) )

marketclose_ret_cleaned_ <- marketclose_ret_cleaned[-c(1)]
feature1_diff_xts_ <- feature1_diff_xts[-c(1)]
plot(as.numeric(marketclose_ret_cleaned_) ~ as.numeric(feature1_diff_xts_) ,xlab="Feature1",
     ylab="Daily % log returns")
lines(lowess(as.numeric(marketclose_ret_cleaned_) ~ as.numeric(feature1_diff_xts_), f=0.1),
      col="red")

feature2_diff_xts_ <- feature2_diff_xts[-c(1)]
plot(as.numeric(marketclose_ret_cleaned) ~ as.numeric(feature2_diff_xts) ,xlab="Feature2",
     ylab="Daily % log returns")
lines(lowess(as.numeric(marketclose_ret_cleaned_) ~ as.numeric(feature2_diff_xts_), f=0.1),
      col="red")

feature3_scaled_xts_ <- feature3_scaled_xts[-c(1)]
plot(as.numeric(marketclose_ret_cleaned_) ~ as.numeric(feature3_scaled_xts_) ,xlab="Feature3",
     ylab="Daily % log returns")
lines(lowess(as.numeric(marketclose_ret_cleaned_) ~ as.numeric(feature3_scaled_xts_), f=0.1),
      col="red")

sel_fit <- auto.arima(marketclose_ret_cleaned_, xreg = cbind(feature2_diff_xts_  ,feature3_scaled_xts_))
plot(resid(sel_fit))

Box.test(resid(sel_fit), lag=100, type="Ljung")

    Box-Ljung test

data:  resid(sel_fit)
X-squared = 187.87, df = 100, p-value = 2.508e-07
qqnorm(resid(sel_fit, type="innovation"))

marketclose_ts <- ts(as.numeric(data_in_cleaned$MARKETCLOSE), start=date_ts_cleaned[1],frequency = 251.625)
marketclose_ts_decomp <- stl(log(marketclose_ts), s.window=21, robust=TRUE)
#no_transform_2003456 <- mstl(weekly_avg_marketclose_2003456)
forecast::autoplot(marketclose_ts_decomp)

#TimeSeriesWeeklyDecomposed<-stl(ts_data , s.window="periodic")
feature1_ts <- ts(as.numeric(feature1_xts), start=date_ts[1],frequency = 251.625)
feature1_ts_decomp <- stl(log(feature1_ts), s.window="periodic", robust=TRUE)
#no_transform_2003456 <- mstl(weekly_avg_marketclose_2003456)
autoplot(feature1_ts_decomp)

feature2_ts <- ts(as.numeric(feature2_xts), start=date_ts[1],frequency = 251.625)
feature2_ts_decomp <- stl(log(feature2_ts), s.window="periodic", robust=TRUE)
#no_transform_2003456 <- mstl(weekly_avg_marketclose_2003456)
autoplot(feature2_ts_decomp)

#monthplot(marketclose_ts_decomp,choice="seasonal")
#plot(seasonal(feature1_ts_decomp))
#lines(seasonal(marketclose_ts_decomp),col=2)
plot(forecast::remainder(marketclose_ts_decomp))
feature1_ts <- ts(data_in_cleaned$FEATURE_1,start = date_ts_cleaned[1],frequency = 251.625)
#lines(-(feature1_ts)/100,col=2)
feature2_ts <- ts(data_in_cleaned$FEATURE_2,start = date_ts_cleaned[1],frequency = 251.625)
#lines(-(feature2_ts)/100,col="green")
feature3_ts <- ts(data_in_cleaned$FEATURE_3,start = date_ts_cleaned[1],frequency = 251.625)
lines(((feature3_ts)/10000),col="blue")


#lines(((feature3_ts/1000))^{1/3}/10,col="blue")
#lines(remainder(marketclose_ts_decomp))

resid_decomp <- forecast::remainder(marketclose_ts_decomp)
resid_decomp_logret <- diff((resid_decomp))
title_logret <- "Daily log returns (%)"
par(mfrow = c(2, 2))
TSA::acf(resid_decomp_logret, na.action = na.pass, lag.max = 100, main = title_logret)
pacf(resid_decomp_logret, na.action = na.pass,lag.max = 100, main = "")


# (P)ACF of absolute value of daily returns
#par(mfrow = c(1, 2))
title_abslogret <- "Daily abs log returns (%)"
TSA::acf(abs(resid_decomp_logret), na.action = na.pass, lag.max = 100, main = title_abslogret)
pacf(abs(resid_decomp_logret), na.action = na.pass, lag.max = 100, main = "")

# (P)ACF of squared daily returns
#par(mfrow = c(1, 2))
#TSA::acf(I(marketclose_ret_cleaned^2), na.action = na.pass, main = title_sp)
#pacf(I(marketclose_ret_cleaned^2), na.action = na.pass, main = title_sp)
plot(as.numeric(resid_decomp_logret) ~ as.numeric(diff(feature2_ts)),xlim=c(-1,1))
lines(lowess(as.numeric(resid_decomp_logret) ~ as.numeric(diff(feature2_ts)), f=1/5),
      col="red")
grid(col=1,lwd=1.5)

plot(as.numeric(resid_decomp_logret) ~ as.numeric((feature3_ts)/100)[-c(1)],xlim=c(-1,1))
lines(lowess(as.numeric(resid_decomp_logret) ~ as.numeric((feature3_ts)/100)[-c(1)], f=1/5),
      col="red")
grid(col=1,lwd=1.5)

plot(as.numeric(resid_decomp_logret) ~ as.numeric(diff(feature1_ts)),xlim=c(-1,1))
lines(lowess(as.numeric(resid_decomp_logret) ~ as.numeric(diff(feature1_ts)), f=1/5),
      col="red")
grid(col=1,lwd=1.5)

for_stl <- forecast(marketclose_ts_decomp, h = 21)
plot(for_stl)

exp(for_stl$mean)
Time Series:
Start = 11884.6299056135 
End = 11884.7093889717 
Frequency = 251.625 
 [1] 137.1074 137.8653 137.4644 137.7676 138.6953 138.5616 139.3502 139.1598 138.8544
[10] 137.4894 136.3955 136.3485 136.0867 135.9724 136.1907 136.4010 137.3733 137.1660
[19] 137.2883 136.3747 136.3359
tail(marketclose_ts)
Time Series:
Start = 11884.6060606061 
End = 11884.6259314456 
Frequency = 251.625 
[1] 136.65 137.24 135.97 137.53 137.55 137.07

Section III: Model Development and analysis (R)

Daily Volatility based Trend Following and Mean Reversion Switching

(Please clear the environment and all variables, starting with a clean slate from here.)

I haven’t considered the provided features in the data for this model development since I don’t know what they represent, how they were extracted and whether or not they were lagged by one trading day. Further it seems all of them are endogenous (since they seem to be extracted from the price data), so they shouldn’t have any extra ‘information’ (in information theoretical sense of the word) even though they might be better predictors. However, here I focus on interpretability of the strategy in terms of variables well known in financial trading.

To enable interpretability of the model, I use well-defined features or features representing well-defined quantities related to a stock price eg. volatility, moving averages, relative strength index etc. The model essentially switches between following a trend and mean reversion based on the stock volatility (relative to its historical values).

The basic strategy is this: when the stock volatility is high relative to its usual volatility, I use mean reversion strategy based on RSI(2). When the volatility is low I follow the trend based on MA. In addition in slow volatility regime if the RSI(2) gets very high, this indicates oversold condition in which case I short. I further show improvement by updating the threshold parameters every 5 days by identifying best model parameters using grid search. Performance is judged on the basis of cumulative PNL wrt always buy strategy. The prediction for next also improves and is -1 from the updating strategy.

This strategy can be potentially further improved by using momentum, which I expect to be particularly strong during downturns. But it has to be balanced against being ‘too-late’ trying to time the stock price. Another potential improvement is to use volatility forecast using a Markov switching based GARCH model (since they seem to have been shown more accurate in predicting short term volatility <week than general GARCH models) or GARCH-ARIMA model. I use R here instead of python, however a small snippet of code to cleaup the data is in python3.

rm(list=ls())
## Setup
## Get some functions/packages
## install.packages("RCurl")
require(RCurl)
sit = getURLContent('https://github.com/systematicinvestor/SIT/raw/master/sit.gz', binary=TRUE, followlocation = TRUE, ssl.verifypeer = FALSE)
con = gzcon(rawConnection(sit, 'rb'))
source(con)
close(con)
## Get data, this is original file downloaded without any changes
setwd("~/Documents/interview_prep/Question1")
data_in <- read.csv(file = 'DataSet.csv')
date_ts <- as.Date(data_in$DATE, format = "%m/%d/%Y")
library(xts)
marketclose_xts <- xts(data_in$MARKETCLOSE,order.by = date_ts)
marketclose_ret <- 100*diff(log(marketclose_xts))
#Estimate historical relative volatility
library(quantmod)
ret.log <- ROC(marketclose_xts, type='continuous')
hist.vol <- runSD(ret.log, n = 21)
vol.rank <- percent.rank(SMA(percent.rank(hist.vol, 252), 21), 250)
## Mean reversion feature
rsi2_values <- RSI(marketclose_xts,2)

## Trend following feature
sma.short <-  SMA(marketclose_xts, 5)
sma.long <- SMA(marketclose_xts, 20)
## Trading strategy: High Volatility - Mean reversion, Low volatility - follow trend
# long if vol.rank > 0.50 and rsi2_values < 50 (mean reversion) short otherwise
# short if vol.rank < 0.50 and 
# rsi2_values > 80 (overbought) or 5 week MA < 20 week MA
# long otherwise
# Add lag to predict next value from current estimates
# 
# `iif` is more stable version of `ifelse()` function in R to gracefully handle `NA` and `Inf` entries

sig <- Lag(iif(vol.rank > 0.50, 
               iif(rsi2_values < 50 , 1, -1),
               iif(sma.short < sma.long | rsi2_values > 80 , -1, 1)
))
## Evaluate
# We evaluate the strategy against always buy strategy in column CUMSUM in data. Momentum is just the difference of successive entries. Noting here that sig is delayed by 1 trading day.
print('CUMSUM column from data:')
print(head(data_in$CUMSUM[-c(1)]))
print("Always buy strategy")
ret <- 1000*momentum(marketclose_xts)*1
print(head(cumsum(ret[-c(1)])))
## Plot cumulative PNL
ret <- 1000*momentum(marketclose_xts)*sig
eq <- (cumsum(ret[-c(1)]))
plot(coredata(eq)~date_ts[-c(1)],type='l',xlab= "Year", ylab = "Cumulative PNL")
grid(col=1,lwd=1)
lines(data_in$CUMSUM[-c(1)]~date_ts[-c(1)],col="red")
legend(x="topleft",legend=c("Volatility based switching","Buy Daily (Given)"),
       fill=c("black","red"))
## Compare cumulative PNL at the end
print(data.frame('Given Strategy'=tail(data_in$CUMSUM[-c(1)]),'Proposed Strategy'=tail(as.vector(eq)),row.names = as.character(tail(date_ts))))
## Optimize thresholding parameters using grid search:
# Parameters: vol.rank_MIN, rsi2_values_MAX, rsi2_values_MIN
# sig.test <-  (iif(vol.rank.test > vol.rank_MIN, 
#                   iif((rsi2_values.test < rsi2_values_MAX) , 1, -1),
#                   iif(sma.short.test < sma.long.test | ((rsi2_values.test > rsi2_values_MIN)), -1, 1)

# we will use foreach for faster for loops
# automatic install of packages if they are not installed already
list.of.packages <- c(
  "foreach",
  "doParallel",
  "ranger",
  "palmerpenguins",
  "tidyverse",
  "kableExtra"
)

new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]

if(length(new.packages) > 0){
  install.packages(new.packages, dep=TRUE)
}

#loading packages
for(package.i in list.of.packages){
  suppressPackageStartupMessages(
    library(
      package.i, 
      character.only = TRUE
    )
  )
}
# Create subset arrays to test how much these parameters change over time
# Every 5 day update of parameters
sub_ser.len <- 5
start_point <- 700
Tmax_array <- seq(start_point,length(marketclose_xts),sub_ser.len)

# Grid search for parametrs that maximize cumulative return over subset array
# and store the prediction for next day
return_max.test <- matrix(-Inf,length(Tmax_array),1)
predict_array.test <- matrix(1,length(Tmax_array),1)
max_par.test <- matrix(0,length(Tmax_array),3)
# Read from saved file for retesting
max_par.test <- read.csv("max_par.test_5day700_30_rsi2MINrange.csv")[,c("V1","V2","V3")]

print("Starting grid search. This might take a while (5-10 minutes)")
foreach (i =  1:length(Tmax_array)) %do% {
  print(paste("Updating threshold parameters for the week",i,"of",length(Tmax_array)))
  Tmax <- Tmax_array[i]
  marketclose_xts.test <- marketclose_xts[1:Tmax]
  marketclose_ret.test <- 100*diff(log(marketclose_xts.test))[-c(1)]
  #Historical volatility
  ret.log.test = ROC(marketclose_xts.test, type='continuous')
  hist.vol.test = runSD(ret.log.test, n = 21)
  vol.rank.test = percent.rank(SMA(percent.rank(hist.vol.test, 252), 21), 250)
  # Mean reversion
  rsi2_values.test = RSI(marketclose_xts.test,2)
  # Trend following
  sma.short.test <-  SMA(marketclose_xts.test, 5)
  sma.long.test <- SMA(marketclose_xts.test, 20)
  # Grid search
  foreach (vol.rank_MIN = seq(0.1,0.8,0.01)) %do% { 
    for (rsi2_values_MAX in seq(30,80,2)) {
      for (rsi2_values_MIN in 80){ #seq(30,80,10)) { # No effect of changes
        # Prediction signal calculation
        sig.test <-  (iif(vol.rank.test > vol.rank_MIN, 
                          iif((rsi2_values.test < rsi2_values_MAX) , 1, -1),
                          iif(sma.short.test < sma.long.test | ((rsi2_values.test > rsi2_values_MIN)), -1, 1)
                          
        ))
        # Save this for future use
        predict.test <- tail(sig.test,1)
        
        # Lag signal by one trading day
        sig.test <- Lag(sig.test)
        ret.test <- 1000*momentum(marketclose_xts.test)*sig.test
        
        return_total.test <- tail(cumsum(ret.test[-c(1)]),1)

        # Save if Cumulative return is more than current maximum
        if (return_total.test > return_max.test[i]) {
          
          return_max.test[i] <- return_total.test
          max_par.test[i,1] <- vol.rank_MIN
          max_par.test[i,2] <- rsi2_values_MAX
          max_par.test[i,3] <- rsi2_values_MIN
          predict_array.test[i] <- predict.test
          sig.test.max <- sig.test
          
        }
        
      }
      
    }
    
  }
}
# Save for fututre reuse
# write.csv(max_par.test,"max_par.test_5day700_30_rsi2MINrange.csv")
# Print optimized parameters for each sub time series
print(max_par.test)
optimized.pars_5days <- data.frame('vol.rank_MIN'=max_par.test[,1],'rsi2_values_MAX'=max_par.test[,2],'rsi2_values_MIN'=max_par.test[,3],row.names = date_ts[Tmax_array])
print(optimized.pars_5days)
# We will use values (0.67,40,80) for dates after and not including 2004-09-10 and before and including 2004-11-19 and so on.

optimized.pars_5days <- data.frame('vol.rank_MIN'=rep(max_par.test[1:length(Tmax_array),1],each=sub_ser.len),'rsi2_values_MAX'=rep(max_par.test[1:length(Tmax_array),2],each=sub_ser.len),'rsi2_values_MIN'=rep(max_par.test[1:length(Tmax_array),3],each=sub_ser.len))
print(dim((optimized.pars_5days)))
# pre-pend values for first 700 days
initial.pars_700days <- data.frame('vol.rank_MIN'=rep(.50,each=start_point),'rsi2_values_MAX'=rep(50,each=start_point),'rsi2_values_MIN'=rep(80,each=start_point))
optimized.pars_5days <- rbind(initial.pars_700days,optimized.pars_5days)
optimized.pars_5days <- optimized.pars_5days[1:length(marketclose_xts),]
print(head(optimized.pars_5days))
print(tail(optimized.pars_5days))
## Evaluate performance of new parameters
## Optimized trading signal
sig_opt <- Lag(iif(vol.rank > optimized.pars_5days$vol.rank_MIN, 
                   iif(rsi2_values < optimized.pars_5days$rsi2_values_MAX  , 1, -1),
                   iif(sma.short < sma.long | rsi2_values > optimized.pars_5days$rsi2_values_MIN, -1, 1)
))
##Plot of cumulative returns
ret_opt <- 1000*momentum(marketclose_xts)*sig_opt
eq_opt <- (cumsum(ret_opt[-c(1)]))
plot(coredata(eq_opt)~date_ts[-c(1)],type='l',col="green",xlab= "Year", ylab = "Cumulative PNL")
lines(coredata(eq)~date_ts[-c(1)])
lines(data_in$CUMSUM[-c(1)]~date_ts[-c(1)],col="red")
grid(col=1,lwd=1)
legend(x="topleft",legend=c("5day update sw: Live trading + backtesting","Initail par switching backtesting","Buy Daily (Given)"),
       fill=c("green","black","red"))
#Clearly new parameters perform better
# We can also use the entirety of time series for backtesting, to see what could have been the performance if we knew the best parameter estimates in the past (or try hit and trial/other optimization to improve overall performance with single set of parameters for all time points) and use it to further improve the update approach 
sig_opt_optimal <- Lag(iif(vol.rank > 0.2, 
                   iif(rsi2_values < 36  , 1, -1),
                   iif(sma.short < sma.long | rsi2_values > 80, -1, 1)
))
ret_opt_optimal <- 1000*momentum(marketclose_xts)*sig_opt_optimal
eq_opt_optimal <- (cumsum(ret_opt_optimal[-c(1)]))
plot(coredata(eq_opt_optimal)~date_ts[-c(1)],col="blue",type='l',xlab= "Year", ylab = "Cumulative PNL")
lines(coredata(eq_opt)~date_ts[-c(1)],col="green")
lines(coredata(eq)~date_ts[-c(1)])
lines(data_in$CUMSUM[-c(1)]~date_ts[-c(1)],col="red")
grid(col=1,lwd=1)
legend(x="topleft",legend=c("5day update sw Livetrading + Backtesting","Initial par switching: Backtesting","Buy Daily (Given)","Optimal par: Backtesting"),
       fill=c("green","black","red","blue"))
#Compare cumulative PNL at the end after improvements
print(data.frame('Given Strategy'=tail(data_in$CUMSUM[-c(1)]),'FixParSwitching'=tail(as.vector(eq)),'5dayUpdateSw'=tail(as.vector(eq_opt)),'Optimal'=tail(as.vector(eq_opt_optimal)),row.names = as.character(tail(date_ts))))
# Predictions for next day using these parameters:
# Prediction from optimal will be same as opt since the parameters are the same
# on last day
lastday_row <- length(marketclose_ret)

prediction_fixed <- iif(vol.rank[lastday_row] > 0.50, 
                                   iif(rsi2_values[lastday_row] < 50 , 1, -1),
                                   iif(sma.short[lastday_row] < sma.long[lastday_row] | rsi2_values[lastday_row] > 80 , -1, 1)
)

prediction_opt <- iif(vol.rank[lastday_row] > optimized.pars_5days$vol.rank_MIN[lastday_row], 
                      iif(rsi2_values[lastday_row] < optimized.pars_5days$rsi2_values_MAX[lastday_row]  , 1, -1),
                      iif(sma.short[lastday_row] < sma.long[lastday_row] | rsi2_values > optimized.pars_5days$rsi2_values_MIN[lastday_row], -1, 1)
)

# Fixed parameter switching fails to predict correctly but 5 day update parameter update does correctly predict -1
print(paste0("Prediction for next day:  ",prediction_opt))

This was also evident in the exploration plots for weekday specific trends on returns

Section III: Model Development and analysis: Part II

Weekday Specific Update

[Using weekday specific daily return volatility (weekly bars) for TF and MR switching]

## Exploiting week of the day patterns in returns
marketclose_weekday <- base::weekdays(date_ts,abbreviate=TRUE)

## Modeling Monday returns
marketclose_ret_Mon <- marketclose_ret[marketclose_weekday == "Mon"]
plot(x=date_ts[marketclose_weekday == "Mon"],y=as.vector(marketclose_ret_Mon),ylim=c(-1.5,1.5),
     xlab="Year",ylab = "% Change",type = 'l',col="pink",
     main="% Weekly change in Monday daily log returns")
lines(lowess(as.vector(marketclose_ret_Mon) ~ date_ts[marketclose_weekday == "Mon"] ,f=1/2),col="red",lwd=1)
lines(lowess(as.vector(marketclose_ret_Mon)  ~ date_ts[marketclose_weekday == "Mon"],f=1/4),col="blue",lwd=1)
lines(lowess(as.vector(marketclose_ret_Mon)  ~ date_ts[marketclose_weekday == "Mon"],f=1/8),col="green",lwd=1)
lines(lowess(as.vector(marketclose_ret_Mon)  ~ date_ts[marketclose_weekday == "Mon"],f=1/12),col="black",lwd=1)
grid(col=1,lwd=1)
legend(x = "bottomleft",legend=c("% log ret", "f=1/2 lowess","f=1/4", "f=1/8","f=1/2"),
       fill = c("pink","red","blue","green","black"), bty='n')
## Monday returns clearly show a visible pattern, we can create a time series for returns and use its predictability to improve the earlier forecast
## Switching strategy based on Monday returns:

## Mean reversion
rsi2_values_Mon = RSI(marketclose_ret_Mon/100,2)

## Trend following
sma.short_Mon <- SMA(marketclose_ret_Mon/100, 2)
sma.long_Mon <-  SMA(marketclose_ret_Mon/100, 5)

## Volatility in weekly bars of daily returns on Mondays
# Note that we had multiplied the changes by 100 when calculating `marketclose_ret`
library(quantmod)
ret.log_Mon = marketclose_ret_Mon/100
hist.vol_Mon = runSD(ret.log_Mon, n = 5)
vol.rank_Mon = percent.rank(SMA(percent.rank(hist.vol_Mon, 52), 5), 50)
## Switching strategy for Monday returns
sig_Mon_ret <-  (Lag(Lag(iif(vol.rank_Mon > 0.50, 
                         iif((rsi2_values_Mon < 50) , 1, -1),
                         iif(sma.short_Mon < sma.long_Mon | ((rsi2_values_Mon > 80)), -1, 1)
))))

## Cumulative PNL plots for predicting daily returns on Mondays
ret_Mon <- 1000*momentum(marketclose_xts)[marketclose_weekday == "Mon"]*sig_Mon_ret
eq_Mon <- (cumsum(ret_Mon[-c(1,2)]))
x_var_Mon <- (date_ts[marketclose_weekday == "Mon"])[-c(1,2)]
plot(as.numeric(eq_Mon)~x_var_Mon,type='l',xlab="Year",ylab="Cumulative returns on Mondays",
     ylim=c(-10000,30000))

eq_opt_Mon <- as.numeric( cumsum( (ret_opt[marketclose_weekday == "Mon"])[-c(1,2)]) )
lines(eq_opt_Mon ~ x_var_Mon,col="green")

eg_givenStr <- as.numeric( cumsum( (1000*momentum(marketclose_xts)[marketclose_weekday == "Mon"])[-c(1,2)]) )
lines(eg_givenStr ~ x_var_Mon,col="red")
grid(col=1,lwd=1)
legend(x="topleft",legend=c("Monday prediction","Combined switching","Buy every Monday (Given)"),
       fill=c("black","green","red"))
## Optimizing parameters

# Create subset arrays to test how much these parameters change over time
# Every 10 week update of parameters
sub_ser.len.Mon <- 5
start_point.Mon <- 140
Tmax_array.Mon <- seq(start_point.Mon,length(marketclose_ret_Mon),sub_ser.len.Mon)

# Grid search for parameters that maximize cumulative return over subset time series
# and store the prediction for next day
return_max.test.Mon <- matrix(-Inf,length(Tmax_array.Mon),1)
predict_array.test.Mon <- matrix(1,length(Tmax_array.Mon),1)
max_par.test.Mon <- matrix(0,length(Tmax_array.Mon),3)
# Or directly read from here
max_par.test.Mon <- read.csv("max_par.test.Mon.csv")[,c("V1","V2","V3")]
print("Starting grid search. This might take a while (5-10 minutes)")
foreach (i =  1:length(Tmax_array.Mon)) %do% {
  print(paste("Updating threshold parameters for the 5 week Monday time series",i,"of",length(Tmax_array.Mon)))
  Tmax.Mon <- Tmax_array.Mon[i]
  marketclose_ret_Mon.test <- marketclose_ret_Mon[1:Tmax.Mon]
  # Differences in returns on Mondays
  marketclose_ret_ret_Mon.test <- 100*diff((marketclose_ret_Mon.test))[-c(1)]
  
  ## Volatility in weekly returns
  ret.log.test.Mon <- marketclose_ret_Mon.test/100
  hist.vol.test.Mon <- runSD(ret.log.test.Mon, n = 5)
  vol.rank.test.Mon <- percent.rank(SMA(percent.rank(hist.vol.test.Mon, 52), 5), 50)
  
  ## Mean reversion
  rsi2_values.test.Mon = RSI(marketclose_ret_Mon.test/100,2)
  ## Trend following
  sma.short.test.Mon <-  SMA(marketclose_ret_Mon.test/100, 2)
  sma.long.test.Mon <- SMA(marketclose_ret_Mon.test/100, 5)
  
  # Grid search for Monday parameters
  foreach (vol.rank.Mon_MIN = seq(0.3,0.8,0.01)) %do% {
    foreach (rsi2_values.Mon_MAX = seq(30,80,2)) %do% {
      for (rsi2_values.Mon_MIN in 60){ #seq(30,80,5))  %do%{ # doesn't change
        sig.test.Mon <-  (iif(vol.rank.test.Mon > vol.rank.Mon_MIN, 
                          iif((rsi2_values.test.Mon < rsi2_values.Mon_MAX) , 1, -1),
                          iif(sma.short.test.Mon < sma.long.test.Mon | ((rsi2_values.test.Mon > rsi2_values.Mon_MIN)), -1, 1)
                          
        ))
        predict.test.Mon <- tail(sig.test.Mon,1)
        sig.test.Mon <- Lag(Lag(sig.test.Mon))
        ret.test.Mon <- 1000*momentum(marketclose_xts)[marketclose_weekday == "Mon"]*sig.test.Mon
        #ret.test.Mon <- 1000*momentum(marketclose_ret_Mon.test)*sig.test.Mon
        
        return_total.test.Mon <- tail(cumsum(ret.test.Mon[-c(1,2)]),1)
        
        #Optimize cumulative return on Mondays
        if (return_total.test.Mon > return_max.test.Mon[i]) {
          
          return_max.test.Mon[i] <- return_total.test.Mon
          max_par.test.Mon[i,1] <- vol.rank.Mon_MIN
          max_par.test.Mon[i,2] <- rsi2_values.Mon_MAX
          max_par.test.Mon[i,3] <- rsi2_values.Mon_MIN
          predict_array.test.Mon[i] <- predict.test.Mon
          sig.test.Mon.max <- sig.test.Mon
          
        }
        
      }
      
    }
    
  }
}


#write.csv(max_par.test.Mon,"max_par.test.Mon.csv")
print(max_par.test.Mon)
# We use the optimal parameters for Monday update (0.44,62,60), to correct the earlier model
## Switching strategy for Monday returns
sig_opt.Mon <-  Lag(Lag(iif(vol.rank_Mon > 0.44, 
                             iif((rsi2_values_Mon < 62) , 1, -1),
                             iif(sma.short_Mon < sma.long_Mon | ((rsi2_values_Mon > 60)), -1, 1)
)))


##Plot of cumulative returns on Mondays after and before update
plot(as.numeric(eq_Mon)~x_var_Mon,type='l',xlab="Year",ylab="Cumulative returns on Mondays",
     ylim=c(-10000,30000))
lines(eq_opt_Mon ~ x_var_Mon,col="green")
lines(eg_givenStr ~ x_var_Mon,col="red")

ret_Mon_opt <- 1000*momentum(marketclose_xts)[marketclose_weekday == "Mon"]*sig_opt.Mon
eq_Mon_opt <- as.numeric( cumsum( (ret_Mon_opt)[-c(1,2)]) )
lines(eq_Mon_opt ~ x_var_Mon,col="blue")

grid(col=1,lwd=1)
legend(x="topleft",legend=c("5 week Monday update + 5 day update","Monday prediction","5 day update","Buy every Monday (Given)"),
       fill=c("blue","black","green","red"))
# Correcting the earlier plots
sig_opt_optimal.MonCorrection <- sig_opt_optimal
sig_opt_optimal.MonCorrection[marketclose_weekday == "Mon"][-c(1,2)] <- sig_opt.Mon[-c(1,2)]

ret_opt_optimal.MonCorrection <- 1000*momentum(marketclose_xts)[-c(1,2)]*sig_opt_optimal.MonCorrection
eq_opt_optimal.MonCorrection <- as.numeric( cumsum( (ret_opt_optimal.MonCorrection) ))
#Cumulative PNL plots for complete series after monday correction
plot(c(NA,eq_opt_optimal.MonCorrection)~date_ts[-c(1)],col="orange",type='l', ylim=c(0,140000),xlab= "Year", ylab = "Cumulative PNL")
lines(coredata(eq_opt_optimal)~date_ts[-c(1)],col="blue")
lines(coredata(eq_opt)~date_ts[-c(1)],col="green")
lines(coredata(eq)~date_ts[-c(1)])
lines(data_in$CUMSUM[-c(1)]~date_ts[-c(1)],col="red")
grid(col=1,lwd=1)
legend(x="topleft",legend=c("5day update sw","Fixed par switching","Buy Daily (Given)","Optimal","Optimal+Monday updates"),
       fill=c("green","black","red","blue","orange"))
# Similarly we can check for weekday patterns on other days and also potential monthly or quarterly patterns

## Since next day on which prediction is needed also falls on a Monday, we can also verify the prediction from Monday series only:
lastweek_row.Mon <- length(marketclose_ret_Mon)
prediction_fixed.Mon <- iif(vol.rank_Mon[lastweek_row.Mon] > 0.50, 
                            iif(rsi2_values_Mon[lastweek_row.Mon] < 50 , 1, -1),
                            iif((sma.short_Mon[lastweek_row.Mon] < sma.long_Mon[lastweek_row.Mon]) | (rsi2_values_Mon[lastweek_row.Mon] > 80) , -1, 1)
)

prediction_opt <- iif(vol.rank_Mon[lastweek_row.Mon] > 0.44, 
                      iif(rsi2_values_Mon[lastweek_row.Mon] < 62  , 1, -1),
                      iif((sma.short_Mon[lastweek_row.Mon] < sma.long_Mon[lastweek_row.Mon])  | (rsi2_values_Mon[lastweek_row.Mon] > 60), -1, 1)
)

# Again fixed parameter switching fails to predict correctly but 2 week parameter update does correctly predict -1
print(paste0("Prediction for next day (2 week parameter update):  ",prediction_opt))
Compare cumulative PNL at the end after Monday correction
print(data.frame('Given Strategy'=tail(data_in$CUMSUM[-c(1)]),'FixParSwitching'=tail(as.vector(eq)),'5dayUpdateSw'=tail(as.vector(eq_opt)),'Optimal'=tail(as.vector(eq_opt_optimal)),'Optimal&MonUpdate'=tail(as.vector(eq_opt_optimal.MonCorrection)),row.names = as.character(tail(date_ts))))

write.csv(cbind("sig_opt_optimal.MonCorrection"=as.numeric(sig_opt_optimal.MonCorrection),"sig_opt_optimal"=as.numeric(sig_opt_optimal),"sig_opt"=as.numeric(sig_opt),'sig'=as.numeric(sig)),"trading_signals.csv",quote=FALSE,row.names = date_ts)
---
title: "R Notebook"
output: html_notebook
---


## Section 0: Answers to questions

### Question 1
### Final Prediction Model (Decision Tree):
```{r}
# Question 1
# Final Prediction Model (Decision Tree):

require(RCurl)
sit = getURLContent('https://github.com/systematicinvestor/SIT/raw/master/sit.gz', binary=TRUE, followlocation = TRUE, ssl.verifypeer = FALSE)
con = gzcon(rawConnection(sit, 'rb'))
source(con)
close(con)

## Get data, this is original file downloaded without any changes
setwd("~/Documents/interview_prep/Question1")
data_in <- read.csv(file = 'DataSet.csv')
date_ts <- as.Date(data_in$DATE, format = "%m/%d/%Y")
library(xts)
marketclose_xts <- xts(data_in$MARKETCLOSE,order.by = date_ts)
marketclose_ret <- 100*diff(log(marketclose_xts))

#Estimate historical relative volatility
library(quantmod)
ret.log <- ROC(marketclose_xts, type='continuous')
hist.vol <- runSD(ret.log, n = 21)
vol.rank <- percent.rank(SMA(percent.rank(hist.vol, 252), 21), 250)

## Mean reversion feature
rsi2_values <- RSI(marketclose_xts,2)

## Trend following feature
sma.short <-  SMA(marketclose_xts, 5)
sma.long <- SMA(marketclose_xts, 20)

sig_opt_optimal <- Lag(iif(vol.rank > 0.2, 
                           iif(rsi2_values < 36  , 1, -1),
                           iif(sma.short < sma.long | rsi2_values > 80, -1, 1)
))
ret_opt_optimal <- 1000*momentum(marketclose_xts)*sig_opt_optimal
# Verify sig_opt_optimal=1 gives ret_opt_optimal= market pnl
# it seems momentum function divides the difference by 10
eq_opt_optimal <- (cumsum(ret_opt_optimal[-c(1)]))

marketclose_weekday <- base::weekdays(date_ts,abbreviate=TRUE)
## Modeling Monday returns
marketclose_ret_Mon <- marketclose_ret[marketclose_weekday == "Mon"]
## Monday returns clearly show a visible pattern, we can create a time series for returns and use its predictability to improve the earlier forecast
## Switching strategy based on Monday returns:

## Mean reversion
rsi2_values_Mon = RSI(marketclose_ret_Mon/100,2)

## Trend following
sma.short_Mon <- SMA(marketclose_ret_Mon/100, 2)
sma.long_Mon <-  SMA(marketclose_ret_Mon/100, 5)
library(quantmod)
ret.log_Mon = marketclose_ret_Mon/100
hist.vol_Mon = runSD(ret.log_Mon, n = 5)
vol.rank_Mon = percent.rank(SMA(percent.rank(hist.vol_Mon, 52), 5), 50)

sig_opt.Mon <-  Lag(Lag(iif(vol.rank_Mon > 0.44, 
                            iif((rsi2_values_Mon < 62) , 1, -1),
                            iif(sma.short_Mon < sma.long_Mon | ((rsi2_values_Mon > 60)), -1, 1)
)))

sig_opt_optimal.MonCorrection <- sig_opt_optimal
sig_opt_optimal.MonCorrection[marketclose_weekday == "Mon"][-c(1,2)] <- sig_opt.Mon[-c(1,2)]

ret_opt_optimal.MonCorrection <- 1000*momentum(marketclose_xts)[-c(1,2)]*sig_opt_optimal.MonCorrection
eq_opt_optimal.MonCorrection <- as.numeric( cumsum( (ret_opt_optimal.MonCorrection) ))

lastday_row <- length(marketclose_ret)

prediction_opt <- iif(vol.rank[lastday_row] > 0.2, 
                      iif(rsi2_values[lastday_row] < 36  , 1, -1),
                      iif(sma.short[lastday_row] < sma.long[lastday_row] | rsi2_values > 80, -1, 1)
)

# Predicts -1
print(paste0("Prediction for next day:  ",prediction_opt))
```

# Question 2
# Model cumulative PNL vs market PNL

```{r}


plot(c(NA,eq_opt_optimal.MonCorrection)~date_ts[-c(1)],col="red",type='l', ylim=c(0,140000),xlab= "Year", ylab = "Cumulative PNL")
lines(coredata(eq_opt_optimal) ~ date_ts[-c(1)],col="blue")
lines(data_in$CUMSUM[-c(1)]~date_ts[-c(1)],col="black")
grid(col=1,lwd=1)
legend(x="topleft",legend=c("Model Cumulative PNL (Monday trend correction) Backtesting","Model Cumulative PNL: Backtesting","Market Cumulative PNL"),
       fill=c("red","blue","black"))
```




# Model Development

##                            Section I: Data Cleaning (Python)
```{python}
# This snippet is written in python3
# import pandas as pd
# import numpy as np
# df = pd.read_csv('DataSet.csv')
# 
# for column in df.columns:
#   print(column)
#   print(df[df[column].isnull()].index.tolist())
# 
# df['FEATURE_3'] = df['FEATURE_3'].str.replace('%',"")
# df['DATE'] = pd.to_datetime(df['DATE'], format="%m/%d/%Y")
#
# #### We can use a dictionary to map the string pattern to nan
# #### cleaning_dict = {'^#.*': np.nan}
# #### df['FEATURE_3'] = df['FEATURE_3'].replace(cleaning_dict, regex=True)
# #### df['BINARY_FEATURE'] = df['BINARY_FEATURE'].replace(cleaning_dict, regex=True)
# # OR Using errors='coerce' will replace all non-numeric values with NaN
# df['BINARY_FEATURE'] = pd.to_numeric(df['BINARY_FEATURE'],errors='coerce')
# df['FEATURE_3'] = pd.to_numeric(df['FEATURE_3'],errors='coerce')
# #### Drop all nan
# df_rmna = df.dropna()
# #### Save file
# df_rmna.to_csv('DataSet_rmNA.csv')
```

# Section II: Data Exploration (R)


```{r}
setwd("~/Documents/interview_prep/Question1")
data_in <- read.csv(file = 'DataSet.csv')
date_ts <- as.Date(data_in$DATE, format = "%m/%d/%Y")
marketclose_ts <- ts(data_in$MARKETCLOSE, start=date_ts[1])
#print(marketclose_ts)
plot.ts(marketclose_ts)
```
```{r}
library(xts)
marketclose_xts <- xts(x=data_in$MARKETCLOSE,order.by = date_ts)
autoplot(marketclose_xts)
```

```{r}
# Split marketclose by week and calculate averages per week
weekly_marketclose_xts <- split(marketclose_xts, f = "weeks")
print(head(weekly_marketclose_xts[[1]]))
print(head(weekly_marketclose_xts[[2]]))

dates_weekly_endpoints <- xts::endpoints(marketclose_xts,on='weeks')
readings_per_week <- dates_weekly_endpoints[-c(1)]-dates_weekly_endpoints[-c(length(dates_weekly_endpoints))]

# print weeks with less than 5 values per week
for (weekID in 1:length(weekly_marketclose_xts)) {
  if (length(weekly_marketclose_xts[[weekID]][,1]) < 5) {
    week_len <- length(weekly_marketclose_xts[[weekID]])
    week_endpoint_ID <- dates_weekly_endpoints[1+weekID]
    print(paste("WeekID:",weekID,", num_readings:",week_len))
    print(paste("Week_start:",as.character(date_ts[week_endpoint_ID-week_len+1]),
    ", Week_end:",as.character(date_ts[week_endpoint_ID]) ) )
  }
}

```

```{r}
weekly_avg_marketclose <- lapply(X = weekly_marketclose_xts, function(X){return(mean(X))})
all_weeks <- ts(unlist(weekly_avg_marketclose))
filtered_weeks <- all_weeks
filtered_weeks[readings_per_week<5] <- NaN
#filtered_weeks <- ts(unlist(weekly_avg_marketclose)[readings_per_week==5])
closes <- cbind(all_weeks, filtered_weeks)
autoplot(closes)
#autoplot(ts(unlist(weekly_avg_marketclose)))
```


It would be interesting to see effects of holidays/long weekends, annual events (eg superbowl, quad witching). We will ignore them for now. Other factors include sector specific sentiment or whole market sentiment, which hopefully decomposition below will uncover. 

Number of daily readings by year:

```{r}

table(format(date_ts,"%Y"))
```

Number of weekly readings by year:

```{r}
table(format(date_ts[dates_weekly_endpoints[-c(1)]],"%Y"))
```

Number of trading days per trading week (avg) by year

```{r}
table(format(date_ts,"%Y"))/table(format(date_ts[dates_weekly_endpoints[-c(1)]],"%Y"))
```

Number of trading days per month by year

```{r}
dates_monthly_endpoints <- xts::endpoints(marketclose_xts,on='months')
table(format(date_ts[dates_monthly_endpoints[-c(1)]],"%Y"))
table(format(date_ts,"%Y"))/table(format(date_ts[dates_monthly_endpoints[-c(1)]],"%Y"))
```

```{r}
table(format(date_ts,"%m/%Y",start=("07/2002")))
```

```{r}
mean(table(format(date_ts,"%m/%Y")))
```


```{r}
table(format(date_ts,"%Y"))
```


Last reading date for each year:

```{r}
date_ts[xts::endpoints(marketclose_xts,on='years')[-c(1)]]
```

Year 2003,04,05,06 seasonal trends:
```{r}
weekly_avg_marketclose_2003456  <- ts(as.numeric(unlist(weekly_avg_marketclose))[(26+1):(26+53+52+52+52)],frequency=52)
#library(forecast)
no_transform_2003456 <- stl(log(weekly_avg_marketclose_2003456), s.window=53)
#no_transform_2003456 <- mstl(weekly_avg_marketclose_2003456)
autoplot(no_transform_2003456)
#TimeSeriesWeeklyDecomposed<-stl(ts_data , s.window="periodic")
```

Seasonal trends:

```{r}
library(forecast)
no_transform_2003456 %>% seasonal() %>% ggsubseriesplot()
```

```{r}
no_transform_2003456 %>% remainder() %>% autoplot()
```


```{r}
rm(list=ls())
```

```{r}
setwd("~/Documents/interview_prep/Question1")
data_in <- read.csv(file = 'DataSet.csv')
date_ts <- as.Date(data_in$DATE, format = "%m/%d/%Y")
marketclose_ts <- ts(data_in$MARKETCLOSE, start=date_ts[1],frequency = 251.625)
plot.ts(marketclose_ts)
```



Given the irregular nature of the time series we use xts modeling and estimate daily returns %

```{r}

library(xts)
marketclose_xts <- xts(x=data_in$MARKETCLOSE,order.by = date_ts)
marketclose_ret <- 100*diff.xts(log(marketclose_xts))
#marketclose_ret <- as.zoo(marketclose_ret[-c(1)])
plot.zoo(marketclose_ret, ylab = "Daily returns (%)", main = "Percentage daily returns")
# lowess fit with the f = 1/average # trading days in a month
# Calculation for 20.8 follow below
lines(index(marketclose_ret), lowess(marketclose_ret[,1], f = 1/20.8)$y, col = "red", lwd = 2)
```

```{r}
# Volatility as function of month
plot(jitter(as.numeric(format(index(marketclose_ret),"%m")), amount = 1/3), marketclose_ret, pch = 20, ylab = "Daily percent returns", xlab = "Month", bty = "l")
```

```{r}
boxplot(as.vector(marketclose_ret) ~ factor(quarters(index(marketclose_ret),abbreviate = TRUE),labels=c("Q3","Q4","Q1","Q2")), xlab = "Quarter", pch = 20, col = rgb(0, 0, 0, 0.4), ylab = "Daily percent returns", bty = "l")
grid(col=1,lwd=1.5)
```

```{r}
boxplot(as.vector(marketclose_ret) ~ factor(months(index(marketclose_ret),abbreviate = TRUE),     levels = c("Jul","Aug","Sep","Oct","Nov","Dec","Jan","Feb","Mar","Apr","May","Jun")), xlab = "Month", pch = 20, col = rgb(0, 0, 0, 0.4), ylab = "Daily percent returns", bty = "l")
grid(col=1,lwd=1.5)
```


```{r}
boxplot(as.vector(marketclose_ret) ~ factor(weekdays(index(marketclose_ret),abbreviate = TRUE),
        levels = c("Mon","Tue","Wed","Thu","Fri")) ,xlab = "Day of the week", pch = 20, col = rgb(0, 0, 0, 0.4), ylab = "Daily percent returns", bty = "l")
grid(col=1,lwd=1.5)
```


```{r}
myplot <- boxplot(as.vector(marketclose_ret) ~ factor(weekdays(index(marketclose_ret),abbreviate = TRUE),levels = c("Mon","Tue","Wed","Thu","Fri"))*factor(months(index(marketclose_ret),abbreviate = TRUE),levels = c("Jul","Aug","Sep","Oct","Nov","Dec","Jan","Feb","Mar","Apr","May","Jun")),
xlab = "Day of the week in given month", pch = 20, col = rgb(0, 0, 0, 0.4), ylab = "Daily percent returns", bty = "l")
grid(col=1,lwd=1.5)
```

```{r}
myplot <- boxplot(as.vector(marketclose_ret) ~ factor(weekdays(index(marketclose_ret),abbreviate = TRUE),levels = c("Mon","Tue","Wed","Thu","Fri"))*factor(quarters(index(marketclose_ret),abbreviate = TRUE),levels = c("Q3","Q4","Q1","Q2")),
xlab = "Day of the week in given year", pch = 20, col = rgb(0, 0, 0, 0.4), ylab = "Daily percent returns", bty = "l")
grid(col=1,lwd=1.5)
```

```{r}
myplot <- boxplot(as.vector(marketclose_ret) ~ factor(weekdays(index(marketclose_ret),abbreviate = TRUE),levels = c("Mon","Tue","Wed","Thu","Fri"))*factor(format(index(marketclose_ret),"%Y"),levels = c("2002","2003","2004","2005","2006","2007")),
xlab = "Day of the week in given year", pch = 20, col = rgb(0, 0, 0, 0.4), ylab = "Daily percent returns", bty = "l")
grid(col=1,lwd=1.5)
```

Monday returns as function of return on friday return in prev week by year

```{r}
#head(which(factor(weekdays(index(marketclose_ret),abbreviate = TRUE),levels = c("Mon","Tue","Wed","Thu","Fri"))=="Mon"))
#head(which(factor(weekdays(index(marketclose_ret),abbreviate = TRUE),levels = c("Mon","Tue","Wed","Thu","Fri"))=="Fri"))


monday_indices <- which(weekdays(index(marketclose_ret),abbreviate = TRUE)=="Mon")
monday_indices <- monday_indices[-c(1)]
plot(as.vector(marketclose_ret[monday_indices])~as.vector(marketclose_ret[(monday_indices-1)]),
     xlab="Prev day change",ylab="Monday change")
lines(lowess(as.vector(marketclose_ret[monday_indices])~as.vector(marketclose_ret[(monday_indices-1)]),f=1/2),col="red")
grid(col=1,lwd=1.5)
```


```{r}

plot.ts(as.vector(marketclose_ret[(monday_indices)]),ylim=c(-1.5,1.5),
     xlab="Date",ylab = "% Change")
#lines(as.vector(100*diff.xts(log(marketclose_xts)[(monday_indices-1)])),col=2)
lines(lowess(as.vector(marketclose_ret[(monday_indices-1)]),f=1/2),col="green",type="o")
lines(lowess(as.vector(marketclose_ret[(monday_indices)]),f=1/2),col="red",ylim=c(-1,1),type="o")
lines(lowess(as.vector(marketclose_ret[(monday_indices-2)]),f=1/2),col="blue",ylim=c(-1,1),type="o")
#lines(lowess(as.vector(marketclose_ret[(monday_indices-3)]),f=1/2),col="pink",ylim=c(-1,1),type="o")
lines(lowess(as.vector(100*diff.xts(log(marketclose_xts)[(monday_indices)])),f=1/2),col="black",ylim=c(-1,1),type="o")
lines(lowess(as.vector(100*diff.xts(log(marketclose_xts)[(monday_indices-1)])),f=1/2),col="orange",ylim=c(-1,1),type="o")
#lines(lowess(as.vector(100*diff.xts(log(marketclose_xts)[(monday_indices-2)])),f=1/2),col="yellow",ylim=c(-1,1),type="o")
#lines(sign(as.numeric(marketclose_ret)[monday_indices]))
grid(col=1,lwd=1.5)
```
```{r}
plot.ts(as.vector(marketclose_ret[(monday_indices)]),ylim=c(-1.5,1.5),
     xlab="Date",ylab = "% Change")
#lines(as.vector(100*diff.xts(log(marketclose_xts)[(monday_indices-1)])),col=2)
lines(lowess(as.vector(marketclose_ret[(monday_indices-1)]),f=1/4),col="green",type="o")
lines(lowess(as.vector(marketclose_ret[(monday_indices)]),f=1/4),col="red",ylim=c(-1,1),type="o")
lines(lowess(as.vector(marketclose_ret[(monday_indices-2)]),f=1/4),col="blue",ylim=c(-1,1),type="o")
#lines(lowess(as.vector(marketclose_ret[(monday_indices-3)]),f=1/2),col="pink",ylim=c(-1,1),type="o")
lines(lowess(as.vector(100*diff.xts(log(marketclose_xts)[(monday_indices)])),f=1/4),col="black",ylim=c(-1,1),type="o")
lines(lowess(as.vector(100*diff.xts(log(marketclose_xts)[(monday_indices-1)])),f=1/4),col="orange",ylim=c(-1,1),type="o")
#lines(lowess(as.vector(100*diff.xts(log(marketclose_xts)[(monday_indices-2)])),f=1/2),col="yellow",ylim=c(-1,1),type="o")
#lines(sign(as.numeric(marketclose_ret)[monday_indices]))
grid(col=1,lwd=1.5)
```

```{r}
plot.ts(as.vector(marketclose_ret[(monday_indices)]),ylim=c(-1.5,1.5),
     xlab="Date",ylab = "% Change")
#lines(as.vector(100*diff.xts(log(marketclose_xts)[(monday_indices-1)])),col=2)
lines(lowess(as.vector(marketclose_ret[(monday_indices-1)]),f=1/8),col="green",type="o")
lines(lowess(as.vector(marketclose_ret[(monday_indices)]),f=1/8),col="red",ylim=c(-1,1),type="o")
lines(lowess(as.vector(marketclose_ret[(monday_indices-2)]),f=1/8),col="blue",ylim=c(-1,1),type="o")
#lines(lowess(as.vector(marketclose_ret[(monday_indices-3)]),f=1/2),col="pink",ylim=c(-1,1),type="o")
lines(lowess(as.vector(100*diff.xts(log(marketclose_xts)[(monday_indices)])),f=1/8),col="black",ylim=c(-1,1),type="o")
lines(lowess(as.vector(100*diff.xts(log(marketclose_xts)[(monday_indices-1)])),f=1/8),col="orange",ylim=c(-1,1),type="o")
#lines(lowess(as.vector(100*diff.xts(log(marketclose_xts)[(monday_indices-2)])),f=1/2),col="yellow",ylim=c(-1,1),type="o")
#lines(sign(as.numeric(marketclose_ret)[monday_indices]))
grid(col=1,lwd=1.5)
```


Even with f=1/8 it seems Monday daily returns will follow strictly downward trend

```{r}
#tail(lowess(as.vector(marketclose_ret[c((monday_indices-1),1173)]),f=1/8)$y,15)
#tail(lowess(as.vector(marketclose_ret[c((monday_indices-2),1172)]),f=1/8)$y,15)
#tail(lowess(as.vector(marketclose_ret[c((monday_indices-3),1171)]),f=1/8)$y,15)
#tail(lowess(as.vector(marketclose_ret[c((monday_indices-4),1170)]),f=1/8)$y,15)
#tail(lowess(as.vector(marketclose_ret[c((monday_indices))]),f=1/8)$y,15)

plot(tail(lowess(as.vector(marketclose_ret[c((monday_indices-1),1173)]),f=1/8)$y,50),ylim=c(-1,1),type='l',main="trend 50 day returns by day of the week",ylab="% Change",xlab="day")
lines(tail(lowess(as.vector(marketclose_ret[c((monday_indices-2),1172)]),f=1/8)$y,50),col="blue")
lines(tail(lowess(as.vector(marketclose_ret[c((monday_indices-3),1171)]),f=1/8)$y,50),col="green")
lines(tail(lowess(as.vector(marketclose_ret[c((monday_indices-4),1170)]),f=1/8)$y,50),col="orange")
lines(tail(lowess(as.vector(marketclose_ret[c((monday_indices))]),f=1/8)$y,50),col="red")
grid(col=1,lwd=1.5)
```

```{r}
plot(c(rep(0,50),rollmean(as.vector(marketclose_ret),50)),ylim=c(-1,1),type='l',main="50 & 200 day MA of daily returns",ylab="% Change",xlab="day",col="red")
lines(c(rep(0,200),rollmean(as.vector(marketclose_ret),200)),col="green")
#lines(as.vector(marketclose_ret))
lines(lowess(marketclose_ret[,1], f = 1/20.8)$y, lwd = 2)
#lines(rollmean(as.vector(marketclose_ret[c((monday_indices-2),1172)]),50),col="blue")
#lines(rollmean(as.vector(marketclose_ret[c((monday_indices-3),1171)]),50),col="green")
#lines(rollmean(as.vector(marketclose_ret[c((monday_indices-4),1170)]),50),col="orange")
#lines(rollmean(as.vector(marketclose_ret[monday_indices]),50),col="red")
grid(col=1,lwd=1.5)
```

```{r}
plot(rollmean(as.vector(marketclose_ret[c((monday_indices-1),1173)]),50),ylim=c(-.25,.25),type='l',main="50 week MA of weekly returns by day of the week",ylab="% Change",xlab="day")
lines(rollmean(as.vector(marketclose_ret[c((monday_indices-2),1172)]),50),col="blue")
lines(rollmean(as.vector(marketclose_ret[c((monday_indices-3),1171)]),50),col="green")
lines(rollmean(as.vector(marketclose_ret[c((monday_indices-4),1170)]),50),col="orange")
lines(rollmean(as.vector(marketclose_ret[monday_indices]),50),col="red")
grid(col=1,lwd=1.5)
```
```{r}
plot(rollmean(as.vector(marketclose_ret[c((monday_indices-1),1173)]),200),ylim=c(-.25,.25),type='l',main="200 week MA returns by day of the week",ylab="% Change",xlab="day")
lines(rollmean(as.vector(marketclose_ret[c((monday_indices-2),1172)]),200),col="blue")
lines(rollmean(as.vector(marketclose_ret[c((monday_indices-3),1171)]),200),col="green")
lines(rollmean(as.vector(marketclose_ret[c((monday_indices-4),1170)]),200),col="orange")
lines(rollmean(as.vector(marketclose_ret[monday_indices]),200),col="red")
grid(col=1,lwd=1.5)
```

monday return wrt prev monday

```{r}
indices_2007 <- which(as.numeric(format(index(marketclose_ret),"%Y")) %in% c(2006,2007))
monday_indices_2007 <- monday_indices[monday_indices %in% indices_2007]
plot(as.vector(marketclose_ret[monday_indices_2007])~as.vector(marketclose_ret[(monday_indices_2007-1)]),
     xlab="Prev day change 2006-2007",ylab="Monday change 2006-2007")
lines(lowess(as.vector(marketclose_ret[monday_indices_2007])~as.vector(marketclose_ret[(monday_indices_2007-1)]),f=1/2),col="red")
grid(col=1,lwd=1.5)
```


```{r}
#plot.zoo(xts(sign(as.numeric(marketclose_ret)[monday_indices_2007])*sign(as.numeric(marketclose_ret)[(monday_indices_2007-1)]),order.by = index(marketclose_ret)[monday_indices_2007]),main= "Sign of Monday change times sign of prev day")
#lines(xts(sign(as.numeric(marketclose_ret)[monday_indices_2007]),order.by = index(marketclose_ret)[monday_indices_2007]),col="red")
#lines(sign(as.numeric(marketclose_ret)[monday_indices_2007]))
plot(lowess(as.vector(marketclose_ret[(monday_indices_2007-1)]),f=1/2),col="green",ylim=c(-1,1),
     xlab="Date",ylab = "% Change")
lines(lowess(as.vector(marketclose_ret[(monday_indices_2007)]),f=1/2),col="red",ylim=c(-1,1),type="o")
lines(lowess(as.vector(marketclose_ret[(monday_indices_2007-2)]),f=1/2),col="blue",ylim=c(-1,1),type="o")
lines(lowess(as.vector(marketclose_ret[(monday_indices_2007-3)]),f=1/2),col="pink",ylim=c(-1,1),type="o")
lines(sign(as.numeric(marketclose_ret)[monday_indices_2007]))
grid(col=1,lwd=1.5)
```

## Plot of features



```{r}
setwd("~/Documents/interview_prep/Question1")
data_in_cleaned <- read.csv(file = 'DataSet_rmNA.csv')
date_ts_cleaned <- as.Date(data_in_cleaned$DATE, format = "%Y-%m-%d")
marketclose_ts_cleaned <- ts(data_in_cleaned$MARKETCLOSE, start=date_ts_cleaned[1],frequency = 251.625)
plot.ts(marketclose_ts_cleaned)
```

```{r}
library(xts)
marketclose_xts_cleaned <- xts(x=data_in_cleaned$MARKETCLOSE,order.by = date_ts_cleaned)
marketclose_ret_cleaned <- 100*diff.xts(log(marketclose_xts_cleaned))
## marketclose_ret_cleaned <- marketclose_ret_cleaned[-c(1)]
#marketclose_ret <- as.zoo(marketclose_ret[-c(1)])
plot.zoo(marketclose_ret_cleaned, ylab = "Daily returns (%)", main = "Percentage daily returns")
# lowess fit with the f = 1/average # trading days in a month
# Calculation for 20.8 follow below
lines(index(marketclose_ret_cleaned), lowess(marketclose_ret_cleaned[,1], f = 1/20.8)$y, col = "red", lwd = 2)
feature1_xts <- xts(x=data_in_cleaned$FEATURE_1,order.by = date_ts_cleaned)
feature1_diff_xts <- diff.xts((feature1_xts))
## feature1_diff_xts <- feature1_diff_xts[-c(1)]
lines(index(feature1_diff_xts), lowess(feature1_diff_xts[,1], f = 1/20.8)$y, col = "blue", lwd = 2)
feature2_xts <- xts(x=data_in_cleaned$FEATURE_2,order.by = date_ts_cleaned)
feature2_diff_xts <- diff.xts((feature2_xts))
lines(index(feature2_diff_xts), lowess(feature2_diff_xts[,1], f = 1/20.8)$y, col = "green", lwd = 2)
## feature2_diff_xts <- feature2_diff_xts[-c(1)]

feature3_xts <- xts(x=data_in_cleaned$FEATURE_3,order.by = date_ts_cleaned)
feature3_scaled_xts <- ((feature3_xts))/100
lines(index(feature3_scaled_xts), lowess(feature3_scaled_xts[,1], f = 1/20.8)$y, col = "orange", lwd = 2)
## feature3_scaled_xts <-  feature3_scaled_xts[-c(1)]

binaryf_xts <- xts(x=data_in_cleaned$BINARY_FEATURE,order.by = date_ts_cleaned)
## binaryf_xts <- binaryf_xts[-c(1)]
lines(index(binaryf_xts),binaryf_xts, col = "yellow", lwd = 2)
```

## Cross corr between time seriers
```{r}
title_logret <- "Daily log returns (%)"
par(mfrow = c(2, 2))
TSA::acf(marketclose_ret_cleaned, na.action = na.pass, lag.max = 100, main = title_logret)
pacf(marketclose_ret_cleaned, na.action = na.pass,lag.max = 100, main = "")


# (P)ACF of absolute value of daily returns
#par(mfrow = c(1, 2))
title_abslogret <- "Daily abs log returns (%)"
TSA::acf(abs(marketclose_ret_cleaned), na.action = na.pass, lag.max = 100, main = title_abslogret)
pacf(abs(marketclose_ret_cleaned), na.action = na.pass, lag.max = 100, main = "")
# (P)ACF of squared daily returns
#par(mfrow = c(1, 2))
#TSA::acf(I(marketclose_ret_cleaned^2), na.action = na.pass, main = title_sp)
#pacf(I(marketclose_ret_cleaned^2), na.action = na.pass, main = title_sp)

```

```{r}
title_logret <- "Feature1 log returns"
par(mfrow = c(2, 2))
TSA::acf(feature1_diff_xts, na.action = na.pass, lag.max = 100, main = title_logret)
pacf(feature1_diff_xts, na.action = na.pass,lag.max = 100, main = "")


# (P)ACF of absolute value of daily returns
#par(mfrow = c(1, 2))
title_abslogret <- "Feature1 abs log returns"
TSA::acf(abs(feature1_diff_xts), na.action = na.pass, lag.max = 100, main = title_abslogret)
pacf(abs(feature1_diff_xts), na.action = na.pass, lag.max = 100, main = "")
```


```{r}
title_logret <- "Feature2 log returns"
par(mfrow = c(2, 2))
TSA::acf(feature2_diff_xts, na.action = na.pass, lag.max = 100, main = title_logret)
pacf(feature2_diff_xts, na.action = na.pass,lag.max = 100, main = "")


# (P)ACF of absolute value of daily returns
#par(mfrow = c(1, 2))
title_abslogret <- "Feature2 abs log returns"
TSA::acf(abs(feature2_diff_xts), na.action = na.pass, lag.max = 100, main = title_abslogret)
pacf(abs(feature2_diff_xts), na.action = na.pass, lag.max = 100, main = "")
```

```{r}
title_logret <- "Feature3 log returns"
par(mfrow = c(2, 2))
TSA::acf(feature3_scaled_xts, na.action = na.pass, lag.max = 100, main = title_logret)
pacf(feature3_scaled_xts, na.action = na.pass,lag.max = 100, main = "")


# (P)ACF of absolute value of daily returns
#par(mfrow = c(1, 2))
title_abslogret <- "Feature3 abs log returns"
TSA::acf(abs(feature3_scaled_xts), na.action = na.pass, lag.max = 100, main = title_abslogret)
pacf(abs(feature3_scaled_xts), na.action = na.pass, lag.max = 100, main = "")
```


## Plot ret vs f1 f2 f3
```{r}
par(mfrow =c(1,3) )

marketclose_ret_cleaned_ <- marketclose_ret_cleaned[-c(1)]
feature1_diff_xts_ <- feature1_diff_xts[-c(1)]
plot(as.numeric(marketclose_ret_cleaned_) ~ as.numeric(feature1_diff_xts_) ,xlab="Feature1",
     ylab="Daily % log returns")
lines(lowess(as.numeric(marketclose_ret_cleaned_) ~ as.numeric(feature1_diff_xts_), f=0.1),
      col="red")

feature2_diff_xts_ <- feature2_diff_xts[-c(1)]
plot(as.numeric(marketclose_ret_cleaned) ~ as.numeric(feature2_diff_xts) ,xlab="Feature2",
     ylab="Daily % log returns")
lines(lowess(as.numeric(marketclose_ret_cleaned_) ~ as.numeric(feature2_diff_xts_), f=0.1),
      col="red")

feature3_scaled_xts_ <- feature3_scaled_xts[-c(1)]
plot(as.numeric(marketclose_ret_cleaned_) ~ as.numeric(feature3_scaled_xts_) ,xlab="Feature3",
     ylab="Daily % log returns")
lines(lowess(as.numeric(marketclose_ret_cleaned_) ~ as.numeric(feature3_scaled_xts_), f=0.1),
      col="red")

```


```{r}
sel_fit <- auto.arima(marketclose_ret_cleaned_, xreg = cbind(feature2_diff_xts_  ,feature3_scaled_xts_))
```

```{r}
plot(resid(sel_fit))
```


```{r}
Box.test(resid(sel_fit), lag=100, type="Ljung")
```
```{r}
qqnorm(resid(sel_fit, type="innovation"))
```
```{r}
marketclose_ts <- ts(as.numeric(data_in_cleaned$MARKETCLOSE), start=date_ts_cleaned[1],frequency = 251.625)
marketclose_ts_decomp <- stl(log(marketclose_ts), s.window=21, robust=TRUE)
#no_transform_2003456 <- mstl(weekly_avg_marketclose_2003456)
forecast::autoplot(marketclose_ts_decomp)
#TimeSeriesWeeklyDecomposed<-stl(ts_data , s.window="periodic")
```

```{r}
feature1_ts <- ts(as.numeric(feature1_xts), start=date_ts[1],frequency = 251.625)
feature1_ts_decomp <- stl(log(feature1_ts), s.window="periodic", robust=TRUE)
#no_transform_2003456 <- mstl(weekly_avg_marketclose_2003456)
autoplot(feature1_ts_decomp)
```

```{r}
feature2_ts <- ts(as.numeric(feature2_xts), start=date_ts[1],frequency = 251.625)
feature2_ts_decomp <- stl(log(feature2_ts), s.window="periodic", robust=TRUE)
#no_transform_2003456 <- mstl(weekly_avg_marketclose_2003456)
autoplot(feature2_ts_decomp)
```

```{r}
#monthplot(marketclose_ts_decomp,choice="seasonal")
#plot(seasonal(feature1_ts_decomp))
#lines(seasonal(marketclose_ts_decomp),col=2)
plot(forecast::remainder(marketclose_ts_decomp))
feature1_ts <- ts(data_in_cleaned$FEATURE_1,start = date_ts_cleaned[1],frequency = 251.625)
#lines(-(feature1_ts)/100,col=2)
feature2_ts <- ts(data_in_cleaned$FEATURE_2,start = date_ts_cleaned[1],frequency = 251.625)
#lines(-(feature2_ts)/100,col="green")
feature3_ts <- ts(data_in_cleaned$FEATURE_3,start = date_ts_cleaned[1],frequency = 251.625)
lines(((feature3_ts)/10000),col="blue")

#lines(((feature3_ts/1000))^{1/3}/10,col="blue")
#lines(remainder(marketclose_ts_decomp))
```


```{r}

resid_decomp <- forecast::remainder(marketclose_ts_decomp)
resid_decomp_logret <- diff((resid_decomp))
title_logret <- "Daily log returns (%)"
par(mfrow = c(2, 2))
TSA::acf(resid_decomp_logret, na.action = na.pass, lag.max = 100, main = title_logret)
pacf(resid_decomp_logret, na.action = na.pass,lag.max = 100, main = "")


# (P)ACF of absolute value of daily returns
#par(mfrow = c(1, 2))
title_abslogret <- "Daily abs log returns (%)"
TSA::acf(abs(resid_decomp_logret), na.action = na.pass, lag.max = 100, main = title_abslogret)
pacf(abs(resid_decomp_logret), na.action = na.pass, lag.max = 100, main = "")
# (P)ACF of squared daily returns
#par(mfrow = c(1, 2))
#TSA::acf(I(marketclose_ret_cleaned^2), na.action = na.pass, main = title_sp)
#pacf(I(marketclose_ret_cleaned^2), na.action = na.pass, main = title_sp)

```

```{r}
plot(as.numeric(resid_decomp_logret) ~ as.numeric(diff(feature2_ts)),xlim=c(-1,1))
lines(lowess(as.numeric(resid_decomp_logret) ~ as.numeric(diff(feature2_ts)), f=1/5),
      col="red")
grid(col=1,lwd=1.5)
```
```{r}
plot(as.numeric(resid_decomp_logret) ~ as.numeric((feature3_ts)/100)[-c(1)],xlim=c(-1,1))
lines(lowess(as.numeric(resid_decomp_logret) ~ as.numeric((feature3_ts)/100)[-c(1)], f=1/5),
      col="red")
grid(col=1,lwd=1.5)
```

```{r}
plot(as.numeric(resid_decomp_logret) ~ as.numeric(diff(feature1_ts)),xlim=c(-1,1))
lines(lowess(as.numeric(resid_decomp_logret) ~ as.numeric(diff(feature1_ts)), f=1/5),
      col="red")
grid(col=1,lwd=1.5)
```

```{r}
for_stl <- forecast(marketclose_ts_decomp, h = 21)
plot(for_stl)
```
```{r}
exp(for_stl$mean)
```


```{r}
tail(marketclose_ts)
```



#  Section III: Model Development and analysis (R)
#### Daily Volatility based Trend Following and Mean Reversion Switching

(Please clear the environment and all variables, starting with a clean slate from here.)
----------------------------------------------------------------------------------------------
I haven't considered the provided features in the data for this model development since I don't know what they represent, how they were extracted and whether or not they were lagged by one trading day. Further it seems all of them are endogenous (since they seem to be extracted from the price data), so they shouldn't have any extra 'information' (in information theoretical sense of the word) even though they might be better predictors. However, here I focus on interpretability of the strategy in terms of variables well known in financial trading.
 
 
To enable interpretability of the model, I use well-defined features or features representing well-defined quantities related to a stock price eg. volatility, moving averages, relative strength index etc. The model essentially switches between following a trend and mean reversion based on the stock volatility (relative to its historical values). 
 
The basic strategy is this: when the stock volatility is high relative to its usual volatility, I use mean reversion strategy based on RSI(2). When the volatility is low I follow the trend based on MA. In addition in slow volatility regime if the RSI(2) gets very high, this indicates oversold condition in which case I short. I further show improvement by updating the threshold parameters every 5 days by identifying best model parameters using grid search. Performance is judged on the basis of cumulative PNL wrt always buy strategy. The prediction for next also improves and is -1 from the updating strategy.
 
This strategy can be potentially further improved by using momentum, which I expect to be particularly strong during downturns. But it has to be balanced against being 'too-late' trying to time the stock price. Another potential improvement is to use volatility forecast using a Markov switching based GARCH model (since they seem to have been shown more accurate in predicting short term volatility <week than general GARCH models) or GARCH-ARIMA model. I use R here instead of python, however a small snippet of code to cleaup the data is in python3.

```{r}
rm(list=ls())
## Setup
## Get some functions/packages
## install.packages("RCurl")
require(RCurl)
sit = getURLContent('https://github.com/systematicinvestor/SIT/raw/master/sit.gz', binary=TRUE, followlocation = TRUE, ssl.verifypeer = FALSE)
con = gzcon(rawConnection(sit, 'rb'))
source(con)
close(con)
```

```{r}
## Get data, this is original file downloaded without any changes
setwd("~/Documents/interview_prep/Question1")
data_in <- read.csv(file = 'DataSet.csv')
date_ts <- as.Date(data_in$DATE, format = "%m/%d/%Y")
library(xts)
marketclose_xts <- xts(data_in$MARKETCLOSE,order.by = date_ts)
marketclose_ret <- 100*diff(log(marketclose_xts))

```

```{r}
#Estimate historical relative volatility
library(quantmod)
ret.log <- ROC(marketclose_xts, type='continuous')
hist.vol <- runSD(ret.log, n = 21)
vol.rank <- percent.rank(SMA(percent.rank(hist.vol, 252), 21), 250)
```


```{r}
## Mean reversion feature
rsi2_values <- RSI(marketclose_xts,2)

## Trend following feature
sma.short <-  SMA(marketclose_xts, 5)
sma.long <- SMA(marketclose_xts, 20)

```


```{r}
## Trading strategy: High Volatility - Mean reversion, Low volatility - follow trend
# long if vol.rank > 0.50 and rsi2_values < 50 (mean reversion) short otherwise
# short if vol.rank < 0.50 and 
# rsi2_values > 80 (overbought) or 5 week MA < 20 week MA
# long otherwise
# Add lag to predict next value from current estimates
# 
# `iif` is more stable version of `ifelse()` function in R to gracefully handle `NA` and `Inf` entries

sig <- Lag(iif(vol.rank > 0.50, 
               iif(rsi2_values < 50 , 1, -1),
               iif(sma.short < sma.long | rsi2_values > 80 , -1, 1)
))

```

```{r}
## Evaluate
# We evaluate the strategy against always buy strategy in column CUMSUM in data. Momentum is just the difference of successive entries. Noting here that sig is delayed by 1 trading day.
print('CUMSUM column from data:')
print(head(data_in$CUMSUM[-c(1)]))
print("Always buy strategy")
ret <- 1000*momentum(marketclose_xts)*1
print(head(cumsum(ret[-c(1)])))

```

```{r}
## Plot cumulative PNL
ret <- 1000*momentum(marketclose_xts)*sig
eq <- (cumsum(ret[-c(1)]))
plot(coredata(eq)~date_ts[-c(1)],type='l',xlab= "Year", ylab = "Cumulative PNL")
grid(col=1,lwd=1)
lines(data_in$CUMSUM[-c(1)]~date_ts[-c(1)],col="red")
legend(x="topleft",legend=c("Volatility based switching","Buy Daily (Given)"),
       fill=c("black","red"))

```

```{r}
## Compare cumulative PNL at the end
print(data.frame('Given Strategy'=tail(data_in$CUMSUM[-c(1)]),'Proposed Strategy'=tail(as.vector(eq)),row.names = as.character(tail(date_ts))))
```

```{r}
## Optimize thresholding parameters using grid search:
# Parameters: vol.rank_MIN, rsi2_values_MAX, rsi2_values_MIN
# sig.test <-  (iif(vol.rank.test > vol.rank_MIN, 
#                   iif((rsi2_values.test < rsi2_values_MAX) , 1, -1),
#                   iif(sma.short.test < sma.long.test | ((rsi2_values.test > rsi2_values_MIN)), -1, 1)

# we will use foreach for faster for loops
# automatic install of packages if they are not installed already
list.of.packages <- c(
  "foreach",
  "doParallel",
  "ranger",
  "palmerpenguins",
  "tidyverse",
  "kableExtra"
)

new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]

if(length(new.packages) > 0){
  install.packages(new.packages, dep=TRUE)
}

#loading packages
for(package.i in list.of.packages){
  suppressPackageStartupMessages(
    library(
      package.i, 
      character.only = TRUE
    )
  )
}

```


```{r}
# Create subset arrays to test how much these parameters change over time
# Every 5 day update of parameters
sub_ser.len <- 5
start_point <- 700
Tmax_array <- seq(start_point,length(marketclose_xts),sub_ser.len)

# Grid search for parametrs that maximize cumulative return over subset array
# and store the prediction for next day
return_max.test <- matrix(-Inf,length(Tmax_array),1)
predict_array.test <- matrix(1,length(Tmax_array),1)
max_par.test <- matrix(0,length(Tmax_array),3)
```



```{r}
# Read from saved file for retesting
max_par.test <- read.csv("max_par.test_5day700_30_rsi2MINrange.csv")[,c("V1","V2","V3")]
```



```{r}

print("Starting grid search. This might take a while (5-10 minutes)")
foreach (i =  1:length(Tmax_array)) %do% {
  print(paste("Updating threshold parameters for the week",i,"of",length(Tmax_array)))
  Tmax <- Tmax_array[i]
  marketclose_xts.test <- marketclose_xts[1:Tmax]
  marketclose_ret.test <- 100*diff(log(marketclose_xts.test))[-c(1)]
  #Historical volatility
  ret.log.test = ROC(marketclose_xts.test, type='continuous')
  hist.vol.test = runSD(ret.log.test, n = 21)
  vol.rank.test = percent.rank(SMA(percent.rank(hist.vol.test, 252), 21), 250)
  # Mean reversion
  rsi2_values.test = RSI(marketclose_xts.test,2)
  # Trend following
  sma.short.test <-  SMA(marketclose_xts.test, 5)
  sma.long.test <- SMA(marketclose_xts.test, 20)
  # Grid search
  foreach (vol.rank_MIN = seq(0.1,0.8,0.01)) %do% { 
    for (rsi2_values_MAX in seq(30,80,2)) {
      for (rsi2_values_MIN in 80){ #seq(30,80,10)) { # No effect of changes
        # Prediction signal calculation
        sig.test <-  (iif(vol.rank.test > vol.rank_MIN, 
                          iif((rsi2_values.test < rsi2_values_MAX) , 1, -1),
                          iif(sma.short.test < sma.long.test | ((rsi2_values.test > rsi2_values_MIN)), -1, 1)
                          
        ))
        # Save this for future use
        predict.test <- tail(sig.test,1)
        
        # Lag signal by one trading day
        sig.test <- Lag(sig.test)
        ret.test <- 1000*momentum(marketclose_xts.test)*sig.test
        
        return_total.test <- tail(cumsum(ret.test[-c(1)]),1)

        # Save if Cumulative return is more than current maximum
        if (return_total.test > return_max.test[i]) {
          
          return_max.test[i] <- return_total.test
          max_par.test[i,1] <- vol.rank_MIN
          max_par.test[i,2] <- rsi2_values_MAX
          max_par.test[i,3] <- rsi2_values_MIN
          predict_array.test[i] <- predict.test
          sig.test.max <- sig.test
          
        }
        
      }
      
    }
    
  }
}
# Save for fututre reuse
# write.csv(max_par.test,"max_par.test_5day700_30_rsi2MINrange.csv")

```

```{r}
# Print optimized parameters for each sub time series
print(max_par.test)
```

```{r}
optimized.pars_5days <- data.frame('vol.rank_MIN'=max_par.test[,1],'rsi2_values_MAX'=max_par.test[,2],'rsi2_values_MIN'=max_par.test[,3],row.names = date_ts[Tmax_array])
print(optimized.pars_5days)
# We will use values (0.67,40,80) for dates after and not including 2004-09-10 and before and including 2004-11-19 and so on.

optimized.pars_5days <- data.frame('vol.rank_MIN'=rep(max_par.test[1:length(Tmax_array),1],each=sub_ser.len),'rsi2_values_MAX'=rep(max_par.test[1:length(Tmax_array),2],each=sub_ser.len),'rsi2_values_MIN'=rep(max_par.test[1:length(Tmax_array),3],each=sub_ser.len))
print(dim((optimized.pars_5days)))
# pre-pend values for first 700 days
initial.pars_700days <- data.frame('vol.rank_MIN'=rep(.50,each=start_point),'rsi2_values_MAX'=rep(50,each=start_point),'rsi2_values_MIN'=rep(80,each=start_point))
optimized.pars_5days <- rbind(initial.pars_700days,optimized.pars_5days)
optimized.pars_5days <- optimized.pars_5days[1:length(marketclose_xts),]
print(head(optimized.pars_5days))
print(tail(optimized.pars_5days))
```


```{r}
## Evaluate performance of new parameters
## Optimized trading signal
sig_opt <- Lag(iif(vol.rank > optimized.pars_5days$vol.rank_MIN, 
                   iif(rsi2_values < optimized.pars_5days$rsi2_values_MAX  , 1, -1),
                   iif(sma.short < sma.long | rsi2_values > optimized.pars_5days$rsi2_values_MIN, -1, 1)
))
```


```{r}
##Plot of cumulative returns
ret_opt <- 1000*momentum(marketclose_xts)*sig_opt
eq_opt <- (cumsum(ret_opt[-c(1)]))
plot(coredata(eq_opt)~date_ts[-c(1)],type='l',col="green",xlab= "Year", ylab = "Cumulative PNL")
lines(coredata(eq)~date_ts[-c(1)])
lines(data_in$CUMSUM[-c(1)]~date_ts[-c(1)],col="red")
grid(col=1,lwd=1)
legend(x="topleft",legend=c("5day update sw: Live trading + backtesting","Initail par switching backtesting","Buy Daily (Given)"),
       fill=c("green","black","red"))
#Clearly new parameters perform better
```

```{r}
# We can also use the entirety of time series for backtesting, to see what could have been the performance if we knew the best parameter estimates in the past (or try hit and trial/other optimization to improve overall performance with single set of parameters for all time points) and use it to further improve the update approach 
sig_opt_optimal <- Lag(iif(vol.rank > 0.2, 
                   iif(rsi2_values < 36  , 1, -1),
                   iif(sma.short < sma.long | rsi2_values > 80, -1, 1)
))
ret_opt_optimal <- 1000*momentum(marketclose_xts)*sig_opt_optimal
eq_opt_optimal <- (cumsum(ret_opt_optimal[-c(1)]))
plot(coredata(eq_opt_optimal)~date_ts[-c(1)],col="blue",type='l',xlab= "Year", ylab = "Cumulative PNL")
lines(coredata(eq_opt)~date_ts[-c(1)],col="green")
lines(coredata(eq)~date_ts[-c(1)])
lines(data_in$CUMSUM[-c(1)]~date_ts[-c(1)],col="red")
grid(col=1,lwd=1)
legend(x="topleft",legend=c("5day update sw Livetrading + Backtesting","Initial par switching: Backtesting","Buy Daily (Given)","Optimal par: Backtesting"),
       fill=c("green","black","red","blue"))
```

```{r}
#Compare cumulative PNL at the end after improvements
print(data.frame('Given Strategy'=tail(data_in$CUMSUM[-c(1)]),'FixParSwitching'=tail(as.vector(eq)),'5dayUpdateSw'=tail(as.vector(eq_opt)),'Optimal'=tail(as.vector(eq_opt_optimal)),row.names = as.character(tail(date_ts))))

```
```{r}
# Predictions for next day using these parameters:
# Prediction from optimal will be same as opt since the parameters are the same
# on last day
lastday_row <- length(marketclose_ret)

prediction_fixed <- iif(vol.rank[lastday_row] > 0.50, 
                                   iif(rsi2_values[lastday_row] < 50 , 1, -1),
                                   iif(sma.short[lastday_row] < sma.long[lastday_row] | rsi2_values[lastday_row] > 80 , -1, 1)
)

prediction_opt <- iif(vol.rank[lastday_row] > optimized.pars_5days$vol.rank_MIN[lastday_row], 
                      iif(rsi2_values[lastday_row] < optimized.pars_5days$rsi2_values_MAX[lastday_row]  , 1, -1),
                      iif(sma.short[lastday_row] < sma.long[lastday_row] | rsi2_values > optimized.pars_5days$rsi2_values_MIN[lastday_row], -1, 1)
)

# Fixed parameter switching fails to predict correctly but 5 day update parameter update does correctly predict -1
print(paste0("Prediction for next day:  ",prediction_opt))
```

This was also evident in the exploration plots for weekday specific trends on returns

## Section III: Model Development and analysis: Part II
##  Weekday Specific Update

[Using weekday specific daily return volatility (weekly bars) for TF and MR switching]

```{r}
## Exploiting week of the day patterns in returns
marketclose_weekday <- base::weekdays(date_ts,abbreviate=TRUE)

## Modeling Monday returns
marketclose_ret_Mon <- marketclose_ret[marketclose_weekday == "Mon"]
plot(x=date_ts[marketclose_weekday == "Mon"],y=as.vector(marketclose_ret_Mon),ylim=c(-1.5,1.5),
     xlab="Year",ylab = "% Change",type = 'l',col="pink",
     main="% Weekly change in Monday daily log returns")
lines(lowess(as.vector(marketclose_ret_Mon) ~ date_ts[marketclose_weekday == "Mon"] ,f=1/2),col="red",lwd=1)
lines(lowess(as.vector(marketclose_ret_Mon)  ~ date_ts[marketclose_weekday == "Mon"],f=1/4),col="blue",lwd=1)
lines(lowess(as.vector(marketclose_ret_Mon)  ~ date_ts[marketclose_weekday == "Mon"],f=1/8),col="green",lwd=1)
lines(lowess(as.vector(marketclose_ret_Mon)  ~ date_ts[marketclose_weekday == "Mon"],f=1/12),col="black",lwd=1)
grid(col=1,lwd=1)
legend(x = "bottomleft",legend=c("% log ret", "f=1/2 lowess","f=1/4", "f=1/8","f=1/2"),
       fill = c("pink","red","blue","green","black"), bty='n')
```


```{r}
## Monday returns clearly show a visible pattern, we can create a time series for returns and use its predictability to improve the earlier forecast
## Switching strategy based on Monday returns:

## Mean reversion
rsi2_values_Mon = RSI(marketclose_ret_Mon/100,2)

## Trend following
sma.short_Mon <- SMA(marketclose_ret_Mon/100, 2)
sma.long_Mon <-  SMA(marketclose_ret_Mon/100, 5)

## Volatility in weekly bars of daily returns on Mondays
# Note that we had multiplied the changes by 100 when calculating `marketclose_ret`
library(quantmod)
ret.log_Mon = marketclose_ret_Mon/100
hist.vol_Mon = runSD(ret.log_Mon, n = 5)
vol.rank_Mon = percent.rank(SMA(percent.rank(hist.vol_Mon, 52), 5), 50)

```



```{r}
## Switching strategy for Monday returns
sig_Mon_ret <-  (Lag(Lag(iif(vol.rank_Mon > 0.50, 
                         iif((rsi2_values_Mon < 50) , 1, -1),
                         iif(sma.short_Mon < sma.long_Mon | ((rsi2_values_Mon > 80)), -1, 1)
))))

## Cumulative PNL plots for predicting daily returns on Mondays
ret_Mon <- 1000*momentum(marketclose_xts)[marketclose_weekday == "Mon"]*sig_Mon_ret
eq_Mon <- (cumsum(ret_Mon[-c(1,2)]))
x_var_Mon <- (date_ts[marketclose_weekday == "Mon"])[-c(1,2)]
plot(as.numeric(eq_Mon)~x_var_Mon,type='l',xlab="Year",ylab="Cumulative returns on Mondays",
     ylim=c(-10000,30000))

eq_opt_Mon <- as.numeric( cumsum( (ret_opt[marketclose_weekday == "Mon"])[-c(1,2)]) )
lines(eq_opt_Mon ~ x_var_Mon,col="green")

eg_givenStr <- as.numeric( cumsum( (1000*momentum(marketclose_xts)[marketclose_weekday == "Mon"])[-c(1,2)]) )
lines(eg_givenStr ~ x_var_Mon,col="red")
grid(col=1,lwd=1)
legend(x="topleft",legend=c("Monday prediction","Combined switching","Buy every Monday (Given)"),
       fill=c("black","green","red"))
```


```{r}
## Optimizing parameters

# Create subset arrays to test how much these parameters change over time
# Every 10 week update of parameters
sub_ser.len.Mon <- 5
start_point.Mon <- 140
Tmax_array.Mon <- seq(start_point.Mon,length(marketclose_ret_Mon),sub_ser.len.Mon)

# Grid search for parameters that maximize cumulative return over subset time series
# and store the prediction for next day
return_max.test.Mon <- matrix(-Inf,length(Tmax_array.Mon),1)
predict_array.test.Mon <- matrix(1,length(Tmax_array.Mon),1)
max_par.test.Mon <- matrix(0,length(Tmax_array.Mon),3)
```


```{r}
# Or directly read from here
max_par.test.Mon <- read.csv("max_par.test.Mon.csv")[,c("V1","V2","V3")]
```


```{r}
print("Starting grid search. This might take a while (5-10 minutes)")
foreach (i =  1:length(Tmax_array.Mon)) %do% {
  print(paste("Updating threshold parameters for the 5 week Monday time series",i,"of",length(Tmax_array.Mon)))
  Tmax.Mon <- Tmax_array.Mon[i]
  marketclose_ret_Mon.test <- marketclose_ret_Mon[1:Tmax.Mon]
  # Differences in returns on Mondays
  marketclose_ret_ret_Mon.test <- 100*diff((marketclose_ret_Mon.test))[-c(1)]
  
  ## Volatility in weekly returns
  ret.log.test.Mon <- marketclose_ret_Mon.test/100
  hist.vol.test.Mon <- runSD(ret.log.test.Mon, n = 5)
  vol.rank.test.Mon <- percent.rank(SMA(percent.rank(hist.vol.test.Mon, 52), 5), 50)
  
  ## Mean reversion
  rsi2_values.test.Mon = RSI(marketclose_ret_Mon.test/100,2)
  ## Trend following
  sma.short.test.Mon <-  SMA(marketclose_ret_Mon.test/100, 2)
  sma.long.test.Mon <- SMA(marketclose_ret_Mon.test/100, 5)
  
  # Grid search for Monday parameters
  foreach (vol.rank.Mon_MIN = seq(0.3,0.8,0.01)) %do% {
    foreach (rsi2_values.Mon_MAX = seq(30,80,2)) %do% {
      for (rsi2_values.Mon_MIN in 60){ #seq(30,80,5))  %do%{ # doesn't change
        sig.test.Mon <-  (iif(vol.rank.test.Mon > vol.rank.Mon_MIN, 
                          iif((rsi2_values.test.Mon < rsi2_values.Mon_MAX) , 1, -1),
                          iif(sma.short.test.Mon < sma.long.test.Mon | ((rsi2_values.test.Mon > rsi2_values.Mon_MIN)), -1, 1)
                          
        ))
        predict.test.Mon <- tail(sig.test.Mon,1)
        sig.test.Mon <- Lag(Lag(sig.test.Mon))
        ret.test.Mon <- 1000*momentum(marketclose_xts)[marketclose_weekday == "Mon"]*sig.test.Mon
        #ret.test.Mon <- 1000*momentum(marketclose_ret_Mon.test)*sig.test.Mon
        
        return_total.test.Mon <- tail(cumsum(ret.test.Mon[-c(1,2)]),1)
        
        #Optimize cumulative return on Mondays
        if (return_total.test.Mon > return_max.test.Mon[i]) {
          
          return_max.test.Mon[i] <- return_total.test.Mon
          max_par.test.Mon[i,1] <- vol.rank.Mon_MIN
          max_par.test.Mon[i,2] <- rsi2_values.Mon_MAX
          max_par.test.Mon[i,3] <- rsi2_values.Mon_MIN
          predict_array.test.Mon[i] <- predict.test.Mon
          sig.test.Mon.max <- sig.test.Mon
          
        }
        
      }
      
    }
    
  }
}


#write.csv(max_par.test.Mon,"max_par.test.Mon.csv")
```

```{r}
print(max_par.test.Mon)
```

```{r}
# We use the optimal parameters for Monday update (0.44,62,60), to correct the earlier model
## Switching strategy for Monday returns
sig_opt.Mon <-  Lag(Lag(iif(vol.rank_Mon > 0.44, 
                             iif((rsi2_values_Mon < 62) , 1, -1),
                             iif(sma.short_Mon < sma.long_Mon | ((rsi2_values_Mon > 60)), -1, 1)
)))


##Plot of cumulative returns on Mondays after and before update
plot(as.numeric(eq_Mon)~x_var_Mon,type='l',xlab="Year",ylab="Cumulative returns on Mondays",
     ylim=c(-10000,30000))
lines(eq_opt_Mon ~ x_var_Mon,col="green")
lines(eg_givenStr ~ x_var_Mon,col="red")

ret_Mon_opt <- 1000*momentum(marketclose_xts)[marketclose_weekday == "Mon"]*sig_opt.Mon
eq_Mon_opt <- as.numeric( cumsum( (ret_Mon_opt)[-c(1,2)]) )
lines(eq_Mon_opt ~ x_var_Mon,col="blue")

grid(col=1,lwd=1)
legend(x="topleft",legend=c("5 week Monday update + 5 day update","Monday prediction","5 day update","Buy every Monday (Given)"),
       fill=c("blue","black","green","red"))


```

```{r}
# Correcting the earlier plots
sig_opt_optimal.MonCorrection <- sig_opt_optimal
sig_opt_optimal.MonCorrection[marketclose_weekday == "Mon"][-c(1,2)] <- sig_opt.Mon[-c(1,2)]

ret_opt_optimal.MonCorrection <- 1000*momentum(marketclose_xts)[-c(1,2)]*sig_opt_optimal.MonCorrection
eq_opt_optimal.MonCorrection <- as.numeric( cumsum( (ret_opt_optimal.MonCorrection) ))
```


```{r}
#Cumulative PNL plots for complete series after monday correction
plot(c(NA,eq_opt_optimal.MonCorrection)~date_ts[-c(1)],col="orange",type='l', ylim=c(0,140000),xlab= "Year", ylab = "Cumulative PNL")
lines(coredata(eq_opt_optimal)~date_ts[-c(1)],col="blue")
lines(coredata(eq_opt)~date_ts[-c(1)],col="green")
lines(coredata(eq)~date_ts[-c(1)])
lines(data_in$CUMSUM[-c(1)]~date_ts[-c(1)],col="red")
grid(col=1,lwd=1)
legend(x="topleft",legend=c("5day update sw","Fixed par switching","Buy Daily (Given)","Optimal","Optimal+Monday updates"),
       fill=c("green","black","red","blue","orange"))
```

```{r}
# Similarly we can check for weekday patterns on other days and also potential monthly or quarterly patterns

## Since next day on which prediction is needed also falls on a Monday, we can also verify the prediction from Monday series only:
lastweek_row.Mon <- length(marketclose_ret_Mon)
prediction_fixed.Mon <- iif(vol.rank_Mon[lastweek_row.Mon] > 0.50, 
                            iif(rsi2_values_Mon[lastweek_row.Mon] < 50 , 1, -1),
                            iif((sma.short_Mon[lastweek_row.Mon] < sma.long_Mon[lastweek_row.Mon]) | (rsi2_values_Mon[lastweek_row.Mon] > 80) , -1, 1)
)

prediction_opt <- iif(vol.rank_Mon[lastweek_row.Mon] > 0.44, 
                      iif(rsi2_values_Mon[lastweek_row.Mon] < 62  , 1, -1),
                      iif((sma.short_Mon[lastweek_row.Mon] < sma.long_Mon[lastweek_row.Mon])  | (rsi2_values_Mon[lastweek_row.Mon] > 60), -1, 1)
)

# Again fixed parameter switching fails to predict correctly but 2 week parameter update does correctly predict -1
print(paste0("Prediction for next day (2 week parameter update):  ",prediction_opt))

```

```{r}
Compare cumulative PNL at the end after Monday correction
print(data.frame('Given Strategy'=tail(data_in$CUMSUM[-c(1)]),'FixParSwitching'=tail(as.vector(eq)),'5dayUpdateSw'=tail(as.vector(eq_opt)),'Optimal'=tail(as.vector(eq_opt_optimal)),'Optimal&MonUpdate'=tail(as.vector(eq_opt_optimal.MonCorrection)),row.names = as.character(tail(date_ts))))

write.csv(cbind("sig_opt_optimal.MonCorrection"=as.numeric(sig_opt_optimal.MonCorrection),"sig_opt_optimal"=as.numeric(sig_opt_optimal),"sig_opt"=as.numeric(sig_opt),'sig'=as.numeric(sig)),"trading_signals.csv",quote=FALSE,row.names = date_ts)
```

